LCOV - code coverage report
Current view: top level - monetdb5/modules/mal - txtsim.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 537 835 64.3 %
Date: 2024-04-25 20:03:45 Functions: 28 33 84.8 %

          Line data    Source code
       1             : /*
       2             :  * SPDX-License-Identifier: MPL-2.0
       3             :  *
       4             :  * This Source Code Form is subject to the terms of the Mozilla Public
       5             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       6             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       7             :  *
       8             :  * Copyright 2024 MonetDB Foundation;
       9             :  * Copyright August 2008 - 2023 MonetDB B.V.;
      10             :  * Copyright 1997 - July 2008 CWI.
      11             :  */
      12             : 
      13             : #include "monetdb_config.h"
      14             : #include "gdk.h"
      15             : #include "str.h"
      16             : #include "mal.h"
      17             : #include "mal_exception.h"
      18             : #include "mal_interpreter.h"
      19             : #include <string.h>
      20             : #include <limits.h>
      21             : 
      22             : #define MIN3(X,Y,Z)   (MIN(MIN((X),(Y)),(Z)))
      23             : #define MAX3(X,Y,Z)   (MAX(MAX((X),(Y)),(Z)))
      24             : #define MIN4(W,X,Y,Z) (MIN(MIN(MIN(W,X),Y),Z))
      25             : 
      26             : static void
      27          15 : reclaim_bats(int nargs, ...)
      28             : {
      29          15 :         va_list valist;
      30             : 
      31          15 :         va_start(valist, nargs);
      32          90 :         for (int i = 0; i < nargs; i++) {
      33          75 :                 BAT *b = va_arg(valist, BAT *);
      34         120 :                 BBPreclaim(b);
      35             :         }
      36          15 :         va_end(valist);
      37          15 : }
      38             : 
      39             : static inline int *
      40        4178 : damerau_get_cellpointer(int *pOrigin, int col, int row, int nCols)
      41             : {
      42        4178 :         return pOrigin + col + (row * (nCols + 1));
      43             : }
      44             : 
      45             : static inline int
      46        2302 : damerau_getat(int *pOrigin, int col, int row, int nCols)
      47             : {
      48        2302 :         int *pCell;
      49        2302 :         pCell = damerau_get_cellpointer(pOrigin, col, row, nCols);
      50        2302 :         return *pCell;
      51             : }
      52             : 
      53             : static inline void
      54        1876 : damerau_putat(int *pOrigin, int col, int row, int nCols, int x)
      55             : {
      56        1876 :         int *pCell;
      57        1876 :         pCell = damerau_get_cellpointer(pOrigin, col, row, nCols);
      58        1876 :         *pCell = x;
      59             : }
      60             : 
      61             : static str
      62          83 : dameraulevenshtein(int *res, str *S, str *T, int insdel_cost, int replace_cost,
      63             :                                    int transpose_cost)
      64             : {
      65          83 :         char *s = *S;
      66          83 :         char *t = *T;
      67          83 :         int *d;                                         /* pointer to matrix */
      68          83 :         int n;                                          /* length of s */
      69          83 :         int m;                                          /* length of t */
      70          83 :         int i;                                          /* iterates through s */
      71          83 :         int j;                                          /* iterates through t */
      72          83 :         char s_i;                                       /* ith character of s */
      73          83 :         char t_j;                                       /* jth character of t */
      74          83 :         int cost;                                       /* cost */
      75          83 :         int cell;                                       /* contents of target cell */
      76          83 :         int above;                                      /* contents of cell immediately above */
      77          83 :         int left;                                       /* contents of cell immediately to left */
      78          83 :         int diag;                                       /* contents of cell immediately above and to left */
      79          83 :         lng sz;                                 /* number of cells in matrix */
      80          83 :         int diag2 = 0, cost2 = 0;
      81             : 
      82         159 :         if (strNil(*S) || strNil(*T)) {
      83          10 :                 *res = int_nil;
      84          10 :                 return MAL_SUCCEED;
      85             :         }
      86             : 
      87             :         /* Step 1 */
      88          73 :         n = (int) strlen(s);            /* 64bit: assume strings are less than 2 GB */
      89          73 :         m = (int) strlen(t);
      90          73 :         if (n == 0) {
      91           8 :                 *res = m;
      92           8 :                 return MAL_SUCCEED;
      93             :         }
      94          65 :         if (m == 0) {
      95           8 :                 *res = n;
      96           8 :                 return MAL_SUCCEED;
      97             :         }
      98          57 :         sz = (((lng)n) + 1) * (((lng)m) + 1) * sizeof(int);
      99          57 :         if (sz > (LL_CONSTANT(1)<<28))
     100           1 :                 throw(MAL, "dameraulevenshtein", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     101          56 :         d = (int *) GDKmalloc((size_t)sz);
     102          56 :         if (d == NULL)
     103           0 :                 throw(MAL, "dameraulevenshtein", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     104             : 
     105             :         /* Step 2 */
     106         346 :         for (i = 0; i <= n; i++) {
     107         290 :                 damerau_putat(d, i, 0, n, i);
     108             :         }
     109         324 :         for (j = 0; j <= m; j++) {
     110         268 :                 damerau_putat(d, 0, j, n, j);
     111             :         }
     112             :         /* Step 3 */
     113         290 :         for (i = 1; i <= n; i++) {
     114         234 :                 s_i = s[i - 1];
     115             :                 /* Step 4 */
     116        1552 :                 for (j = 1; j <= m; j++) {
     117        1318 :                         t_j = t[j - 1];
     118             :                         /* Step 5 */
     119        1318 :                         if (s_i == t_j) {
     120             :                                 cost = 0;
     121             :                         } else {
     122        1002 :                                 cost = replace_cost;
     123             :                         }
     124             :                         /* Step 6 */
     125        1318 :                         above = damerau_getat(d, i - 1, j, n);
     126        1318 :                         left = damerau_getat(d, i, j - 1, n);
     127        1318 :                         diag = damerau_getat(d, i - 1, j - 1, n);
     128        1318 :                         if (j >= 2 && i >= 2) {
     129             :                                 /* NEW: detect transpositions */
     130         928 :                                 diag2 = damerau_getat(d, i - 2, j - 2, n);
     131         928 :                                 if (s[i - 2] == t[j - 1] && s[i - 1] == t[j - 2]) {
     132             :                                         cost2 = transpose_cost;
     133             :                                 } else {
     134         856 :                                         cost2 = 2;
     135             :                                 }
     136         928 :                                 cell = MIN4(above + insdel_cost, left + insdel_cost,
     137             :                                                         diag + cost, diag2 + cost2);
     138             :                         } else {
     139         390 :                                 cell = MIN3(above + insdel_cost, left + insdel_cost,
     140             :                                                         diag + cost);
     141             :                         }
     142        1318 :                         damerau_putat(d, i, j, n, cell);
     143             :                 }
     144             :         }
     145             :         /* Step 7 */
     146          56 :         *res = damerau_getat(d, n, m, n);
     147          56 :         GDKfree(d);
     148          56 :         return MAL_SUCCEED;
     149             : }
     150             : 
     151             : static str
     152          39 : TXTSIMdameraulevenshtein1(int *result, str *s, str *t)
     153             : {
     154           9 :         return dameraulevenshtein(result, s, t, 1, 1, 2);
     155             : }
     156             : 
     157             : static str
     158           9 : TXTSIMdameraulevenshtein2(int *result, str *s, str *t)
     159             : {
     160           9 :         return dameraulevenshtein(result, s, t, 1, 1, 1);
     161             : }
     162             : 
     163             : static str
     164           2 : TXTSIMdameraulevenshtein(Client cntxt, MalBlkPtr mb, MalStkPtr stk,
     165             :                                                  InstrPtr pci)
     166             : {
     167           2 :         (void) cntxt;
     168           2 :         (void) mb;
     169           2 :         int *res = getArgReference_int(stk, pci, 0);
     170           2 :         str *X = getArgReference_str(stk, pci, 1),
     171           2 :                 *Y = getArgReference_str(stk, pci, 2);
     172           2 :         int insdel_cost, replace_cost, transpose_cost;
     173             : 
     174           2 :         assert(pci->argc == 3 || pci->argc == 6);
     175             : 
     176           2 :         if (pci->argc == 3) {
     177             :                 insdel_cost = 1;
     178             :                 replace_cost = 1;
     179             :                 transpose_cost = 2;
     180             :         } else {
     181           1 :                 insdel_cost = *getArgReference_int(stk, pci, 3);
     182           1 :                 replace_cost = *getArgReference_int(stk, pci, 4);
     183           1 :                 transpose_cost = *getArgReference_int(stk, pci, 5);
     184             :         }
     185             : 
     186           2 :         return dameraulevenshtein(res, X, Y, insdel_cost, replace_cost,
     187             :                                                           transpose_cost);
     188             : }
     189             : 
     190             : static inline str
     191          12 : levenshtein(int *res, const str *X, const str *Y, int insdel_cost,
     192             :                         int replace_cost, int max)
     193             : {
     194          12 :         str x = *X, y = *Y, x_iter = x;
     195          12 :         unsigned int xlen, ylen, i = 0, j = 0;
     196          12 :         unsigned int last_diagonal, old_diagonal;
     197          12 :         int cx, cy;
     198          12 :         unsigned int *column, min;
     199             : 
     200          22 :         if (strNil(*X) || strNil(*Y)) {
     201           3 :                 *res = int_nil;
     202           3 :                 return MAL_SUCCEED;
     203             :         }
     204             : 
     205           9 :         xlen = UTF8_strlen(x);
     206           9 :         ylen = UTF8_strlen(y);
     207             : 
     208           9 :         if (xlen == ylen && (strcmp(x, y) == 0))
     209             :                 return MAL_SUCCEED;
     210             : 
     211           8 :         column = GDKmalloc((xlen + 1) * sizeof(unsigned int));
     212           8 :         if (column == NULL)
     213           0 :                 throw(MAL, "levenshtein", MAL_MALLOC_FAIL);
     214             : 
     215          59 :         for (i = 1; i <= xlen; i++)
     216          51 :                 column[i] = i;
     217             : 
     218          57 :         for (j = 1; j <= ylen; j++) {
     219          49 :                 column[0] = j;
     220          49 :                 min = INT_MAX;
     221          49 :                 x_iter = x;
     222          49 :                 UTF8_GETCHAR(cy, y);
     223         306 :                 for (i = 1, last_diagonal = j - 1; i <= xlen; i++) {
     224         257 :                         UTF8_GETCHAR(cx, x_iter);
     225         257 :                         old_diagonal = column[i];
     226         257 :                         column[i] = MIN3(column[i] + insdel_cost,
     227             :                                                          column[i - 1] + insdel_cost,
     228             :                                                          last_diagonal + (cx == cy ? 0 : replace_cost));
     229         257 :                         last_diagonal = old_diagonal;
     230         257 :                         if (last_diagonal < min)
     231             :                                 min = last_diagonal;
     232             :                 }
     233          49 :                 if (max != -1 && min > (unsigned int) max) {
     234           0 :                         *res = INT_MAX;
     235           0 :                         GDKfree(column);
     236           0 :                         return MAL_SUCCEED;
     237             :                 }
     238          49 :                 (*x)++;
     239             :         }
     240             : 
     241           8 :         *res = column[xlen];
     242           8 :         GDKfree(column);
     243           8 :         return MAL_SUCCEED;
     244           0 :   illegal:
     245             :         /* UTF8_GETCHAR bail */
     246           0 :         GDKfree(column);
     247           0 :         throw(MAL, "txtsim.levenshtein", "Illegal unicode code point");
     248             : }
     249             : 
     250             : /* Levenshtein OP but with column externaly allocated */
     251             : static inline int
     252           0 : levenshtein2(const str X, const str Y, const size_t xlen, const size_t ylen,
     253             :                          unsigned int *column, const int insdel_cost,
     254             :                          const int replace_cost, const int max)
     255             : {
     256           0 :         str x = X, y = Y;
     257           0 :         str x_iter = x;
     258           0 :         unsigned int i = 0, j = 0, min;;
     259           0 :         unsigned int last_diagonal, old_diagonal;
     260           0 :         int cx, cy;
     261             : 
     262           0 :         if (strNil(x) || strNil(y))
     263           0 :                 return int_nil;
     264             : 
     265           0 :         if (xlen == ylen && (strcmp(x, y) == 0))
     266             :                 return 0;
     267             : 
     268           0 :         for (i = 1; i <= xlen; i++)
     269           0 :                 column[i] = i;
     270             : 
     271           0 :         for (j = 1; j <= ylen; j++) {
     272           0 :                 column[0] = j;
     273           0 :                 min = INT_MAX;
     274           0 :                 x_iter = x;
     275           0 :                 UTF8_GETCHAR(cy, y);
     276           0 :                 for (i = 1, last_diagonal = j - 1; i <= xlen; i++) {
     277           0 :                         UTF8_GETCHAR(cx, x_iter);
     278           0 :                         old_diagonal = column[i];
     279           0 :                         column[i] = MIN3(column[i] + insdel_cost,
     280             :                                                          column[i - 1] + insdel_cost,
     281             :                                                          last_diagonal + (cx == cy ? 0 : replace_cost));
     282           0 :                         last_diagonal = old_diagonal;
     283           0 :                         if (last_diagonal < min)
     284             :                                 min = last_diagonal;
     285             :                 }
     286           0 :                 if (max != -1 && min > (unsigned int) max)
     287             :                         return INT_MAX;
     288           0 :                 (*x)++;
     289             :         }
     290           0 :         return column[xlen];
     291             :   illegal:
     292             :         /* UTF8_GETCHAR bail */
     293             :         return INT_MAX;
     294             : }
     295             : 
     296             : static str
     297          45 : TXTSIMlevenshtein(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
     298             : {
     299          45 :         (void) cntxt;
     300          45 :         (void) mb;
     301          45 :         int *res = getArgReference_int(stk, pci, 0);
     302          45 :         str *X = getArgReference_str(stk, pci, 1),
     303          45 :                 *Y = getArgReference_str(stk, pci, 2);
     304          45 :         int insdel_cost, replace_cost;
     305             : 
     306          45 :         if (pci->argc == 3) {
     307             :                 insdel_cost = 1;
     308             :                 replace_cost = 1;
     309          34 :         } else if (pci->argc == 5 || pci->argc == 6) {
     310          34 :                 insdel_cost = *getArgReference_int(stk, pci, 3);
     311          34 :                 replace_cost = *getArgReference_int(stk, pci, 4);
     312             :                 /* Backwards compatibility purposes */
     313          34 :                 if (pci->argc == 6) {
     314          33 :                         int transposition_cost = *getArgReference_int(stk, pci, 5);
     315          33 :                         return dameraulevenshtein(res, X, Y, insdel_cost, replace_cost,
     316             :                                                                           transposition_cost);
     317             :                 }
     318             :         } else {
     319           0 :                 throw(MAL, "txtsim.levenshtein", RUNTIME_SIGNATURE_MISSING);
     320             :         }
     321             : 
     322          12 :         return levenshtein(res, X, Y, insdel_cost, replace_cost, -1);;
     323             : }
     324             : 
     325             : static str
     326           0 : TXTSIMmaxlevenshtein(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
     327             : {
     328           0 :         (void) cntxt;
     329           0 :         (void) mb;
     330           0 :         int *res = getArgReference_int(stk, pci, 0);
     331           0 :         const str *X = getArgReference_str(stk, pci, 1),
     332           0 :                 *Y = getArgReference_str(stk, pci, 2);
     333           0 :         const int *k = getArgReference_int(stk, pci, 3);
     334           0 :         int insdel_cost, replace_cost;
     335             : 
     336           0 :         if (pci->argc == 4) {
     337             :                 insdel_cost = 1;
     338             :                 replace_cost = 1;
     339           0 :         } else if (pci->argc == 6) {
     340           0 :                 insdel_cost = *getArgReference_int(stk, pci, 4);
     341           0 :                 replace_cost = *getArgReference_int(stk, pci, 5);
     342             :         } else {
     343           0 :                 throw(MAL, "txtsim.maxlevenshtein", RUNTIME_SIGNATURE_MISSING);
     344             :         }
     345             : 
     346           0 :         return levenshtein(res, X, Y, insdel_cost, replace_cost, *k);
     347             : }
     348             : 
     349             : static str
     350           0 : BATTXTSIMmaxlevenshtein(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
     351             : {
     352           0 :         (void) cntxt;
     353           0 :         (void) mb;
     354           0 :         bat *res = getArgReference_bat(stk, pci, 0);
     355           0 :         bat *lid = getArgReference_bat(stk, pci, 1);
     356           0 :         bat *rid = getArgReference_bat(stk, pci, 2);
     357           0 :         int *k = getArgReference_int(stk, pci, 3);
     358           0 :         int insdel_cost = pci->argc == 6 ? *getArgReference_int(stk, pci, 4) : 1,
     359           0 :                         replace_cost = pci->argc == 6 ? *getArgReference_int(stk, pci, 5) : 1;
     360           0 :         BAT *left = NULL, *right = NULL, *bn = NULL;
     361           0 :         BUN p, q;
     362           0 :         BATiter li, ri;
     363           0 :         unsigned int *buffer = NULL;
     364           0 :         str lv, rv, msg = MAL_SUCCEED;
     365           0 :         size_t llen = 0, rlen = 0, maxlen = 0;
     366           0 :         int d;
     367           0 :         bit v;
     368             : 
     369           0 :         if ((left = BATdescriptor(*lid)) == NULL) {
     370           0 :                 msg = createException(MAL, "battxtsim.maxlevenshtein",
     371             :                                                           SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
     372           0 :                 goto exit;
     373             :         }
     374           0 :         if ((right = BATdescriptor(*rid)) == NULL) {
     375           0 :                 msg = createException(MAL, "battxtsim.maxlevenshtein",
     376             :                                                           SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
     377           0 :                 goto exit;
     378             :         }
     379           0 :         if (BATcount(left) != BATcount(right)) {
     380           0 :                 msg = createException(MAL, "battxtsim.maxlevenshtein",
     381             :                                                           "Columns must be aligned");
     382           0 :                 goto exit;
     383             :         }
     384           0 :         if ((bn = COLnew(0, TYPE_bit, BATcount(left), TRANSIENT)) == NULL) {
     385           0 :                 msg = createException(MAL, "battxtsim.maxlevenshtein",
     386             :                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     387           0 :                 goto exit;
     388             :         }
     389             : 
     390           0 :         li = bat_iterator(left);
     391           0 :         ri = bat_iterator(right);
     392           0 :         BATloop(left, p, q) {
     393           0 :                 lv = (str) BUNtail(li, p);
     394           0 :                 rv = (str) BUNtail(ri, p);
     395           0 :                 llen = UTF8_strlen(lv);
     396           0 :                 rlen = UTF8_strlen(rv);
     397           0 :                 if (abs((int) llen - (int) rlen) > (int) *k)
     398           0 :                         v = false;
     399             :                 else {
     400           0 :                         if (llen > maxlen) {
     401           0 :                                 maxlen = llen;
     402           0 :                                 unsigned int *tmp = GDKrealloc(buffer, (maxlen + 1) * sizeof(unsigned int));
     403           0 :                                 if (tmp == NULL) {
     404           0 :                                         bat_iterator_end(&li);
     405           0 :                                         bat_iterator_end(&ri);
     406           0 :                                         msg = createException(MAL, "battxtsim.maxlevenshtein",
     407             :                                                                                   SQLSTATE(HY013) MAL_MALLOC_FAIL);
     408           0 :                                         goto exit;
     409             :                                 }
     410             :                                 buffer = tmp;
     411             :                         }
     412           0 :                         d = levenshtein2(lv, rv, llen, rlen, buffer, insdel_cost,
     413             :                                                          replace_cost, (int) *k);
     414           0 :                         v = (bit) (d <= (int) *k);
     415             :                 }
     416           0 :                 if (BUNappend(bn, (const void *) &v, false) != GDK_SUCCEED) {
     417           0 :                         bat_iterator_end(&li);
     418           0 :                         bat_iterator_end(&ri);
     419           0 :                         msg = createException(MAL, "battxtsim.maxlevenshtein",
     420             :                                                                   "BUNappend failed");
     421           0 :                         goto exit;
     422             :                 }
     423             :         }
     424           0 :         bat_iterator_end(&li);
     425           0 :         bat_iterator_end(&ri);
     426             : 
     427           0 :         *res = bn->batCacheid;
     428           0 :         BBPkeepref(bn);
     429           0 :   exit:
     430           0 :         GDKfree(buffer);
     431           0 :         BBPreclaim(left);
     432           0 :         BBPreclaim(right);
     433           0 :         if (msg != MAL_SUCCEED)
     434           0 :                 BBPreclaim(bn);
     435           0 :         return msg;
     436             : }
     437             : 
     438             : #define JARO_WINKLER_SCALING_FACTOR 0.1
     439             : #define JARO_WINKLER_PREFIX_LEN 4
     440             : 
     441             : typedef struct {
     442             :         size_t matches;                         /* accumulator for number of matches for this item */
     443             :         BUN o;                                          /* position in the BAT */
     444             :         str val;                                        /* string value */
     445             :         int *cp_sequence;                       /* string as array of Unicode codepoints */
     446             :         int len;                                        /* string length in characters (multi-byte characters count as 1) */
     447             :         int cp_seq_len;                         /* string length in bytes */
     448             :         uint64_t abm;                           /* 64bit alphabet bitmap */
     449             :         int abm_popcount;                       /* hamming weight of abm */
     450             : } str_item;
     451             : 
     452             : static inline int
     453          95 : popcount64(uint64_t x)
     454             : {
     455          95 :         x = (x & 0x5555555555555555ULL) + ((x >> 1) & 0x5555555555555555ULL);
     456          95 :         x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
     457          95 :         x = (x & 0x0F0F0F0F0F0F0F0FULL) + ((x >> 4) & 0x0F0F0F0F0F0F0F0FULL);
     458          95 :         return (x * 0x0101010101010101ULL) >> 56;
     459             : }
     460             : 
     461             : static int
     462          45 : str_item_lenrev_cmp(const void *a, const void *b)
     463             : {
     464          45 :         return ((int) ((str_item *) b)->len - (int) ((str_item *) a)->len);
     465             : }
     466             : 
     467             : static str
     468          64 : str_2_codepointseq(str_item *s)
     469             : {
     470          64 :         str p = s->val;
     471          64 :         int c;
     472             : 
     473          64 :         s->cp_sequence = GDKmalloc(s->len * sizeof(int));
     474          64 :         if (s->cp_sequence == NULL)
     475           0 :                 throw(MAL, "str_2_byteseq", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     476             : 
     477         428 :         for (int i = 0; i < s->len; i++) {
     478         364 :                 UTF8_GETCHAR(c, p);
     479         364 :                 if (c == 0)
     480             :                         break;
     481         364 :                 s->cp_sequence[i] = c;
     482             :         }
     483             :         return MAL_SUCCEED;
     484           0 :   illegal:
     485           0 :         throw(MAL, "str_2_byteseq", SQLSTATE(42000) "Illegal unicode code point");
     486             : }
     487             : 
     488             : static void
     489          60 : str_alphabet_bitmap(str_item *s)
     490             : {
     491          60 :         int i;
     492             : 
     493          60 :         s->abm = 0ULL;
     494             : 
     495         400 :         for (i = 0; i < s->len; i++)
     496         340 :                 s->abm |= 1ULL << (s->cp_sequence[i] % 64);
     497             : 
     498          60 :         s->abm_popcount = popcount64(s->abm);
     499          60 : }
     500             : 
     501             : static inline double
     502          13 : jarowinkler_lp(const str_item *a, const str_item *b)
     503             : {
     504          13 :         unsigned int l;
     505             : 
     506             :         /* calculate common string prefix up to prefixlen chars */
     507          13 :         l = 0;
     508          65 :         for (int i = 0; i < MIN3(a->len, b->len, JARO_WINKLER_PREFIX_LEN); i++)
     509          52 :                 l += (a->cp_sequence[i] == b->cp_sequence[i]);
     510             : 
     511          13 :         return (double) l *JARO_WINKLER_SCALING_FACTOR;
     512             : }
     513             : 
     514             : static inline double
     515          15 : jarowinkler(const str_item *x, const str_item *y, double lp, int *x_flags,
     516             :                         int *y_flags)
     517             : {
     518          15 :         int xlen = x->len, ylen = y->len;
     519          15 :         int range = MAX(0, MAX(xlen, ylen) / 2 - 1);
     520          15 :         int *x1 = x->cp_sequence, *x2 = y->cp_sequence;
     521          15 :         int m = 0, t = 0;
     522          15 :         int i, j, l;
     523          15 :         double dw;
     524             : 
     525          15 :         if (!xlen || !ylen)
     526             :                 return 0.0;
     527          89 :         for (i = 0; i < xlen; i++)
     528          74 :                 x_flags[i] = 0;
     529         102 :         for (i = 0; i < ylen; i++)
     530          87 :                 y_flags[i] = 0;
     531             :         /* matching chars */
     532         102 :         for (i = 0; i < ylen; i++) {
     533         280 :                 for (j = MAX(i - range, 0), l = MIN(i + range + 1, xlen); j < l; j++) {
     534         225 :                         if (x2[i] == x1[j] && !x_flags[j]) {
     535          32 :                                 x_flags[j] = 1;
     536          32 :                                 y_flags[i] = 1;
     537          32 :                                 m++;
     538          32 :                                 break;
     539             :                         }
     540             :                 }
     541             :         }
     542          15 :         if (!m)
     543             :                 return 0.0;
     544             :         /* char transpositions */
     545             :         l = 0;
     546          90 :         for (i = 0; i < ylen; i++) {
     547          77 :                 if (y_flags[i] == 1) {
     548          48 :                         for (j = l; j < xlen; j++) {
     549          48 :                                 if (x_flags[j] == 1) {
     550          32 :                                         l = j + 1;
     551          32 :                                         break;
     552             :                                 }
     553             :                         }
     554          32 :                         if (x2[i] != x1[j])
     555           4 :                                 t++;
     556             :                 }
     557             :         }
     558          13 :         t /= 2;
     559             : 
     560             :         /* jaro similarity */
     561          13 :         dw = (((double) m / xlen) + ((double) m / ylen) +
     562          13 :                   ((double) (m - t) / m)) / 3.0;
     563             :         /* calculate common string prefix up to prefixlen chars */
     564          13 :         if (lp == -1)
     565          13 :                 lp = jarowinkler_lp(x, y);
     566             :         /* jaro-winkler similarity */
     567          13 :         dw = dw + (lp * (1 - dw));
     568          13 :         return dw;
     569             : }
     570             : 
     571             : static str
     572           2 : TXTSIMjarowinkler(dbl *res, str *x, str *y)
     573             : {
     574           2 :         int *x_flags = NULL, *y_flags = NULL;
     575           2 :         str_item xi = { 0 }, yi = { 0 };
     576           2 :         str msg = MAL_SUCCEED;
     577             : 
     578           2 :         xi.val = *x;
     579           2 :         xi.len = UTF8_strlen(*x);
     580           2 :         if ((msg = str_2_codepointseq(&xi)) != MAL_SUCCEED)
     581           0 :                 goto bailout;
     582             : 
     583           2 :         yi.val = *y;
     584           2 :         yi.len = UTF8_strlen(*y);
     585           2 :         if ((msg = str_2_codepointseq(&yi)) != MAL_SUCCEED)
     586           0 :                 goto bailout;
     587             : 
     588           2 :         x_flags = GDKmalloc(xi.len * sizeof(int));
     589           2 :         y_flags = GDKmalloc(yi.len * sizeof(int));
     590             : 
     591           2 :         if (x_flags == NULL || y_flags == NULL)
     592           0 :                 goto bailout;
     593             : 
     594           2 :         *res = jarowinkler(&xi, &yi, -1, x_flags, y_flags);
     595             : 
     596           2 :   bailout:
     597           2 :         GDKfree(x_flags);
     598           2 :         GDKfree(y_flags);
     599           2 :         GDKfree(xi.cp_sequence);
     600           2 :         GDKfree(yi.cp_sequence);
     601           2 :         return msg;
     602             : }
     603             : 
     604             : static str
     605           0 : TXTSIMminjarowinkler(bit *res, str *x, str *y, const dbl *threshold)
     606             : {
     607           0 :         str msg = MAL_SUCCEED;
     608           0 :         double s = 1;
     609             : 
     610           0 :         msg = TXTSIMjarowinkler(&s, x, y);
     611           0 :         if (msg != MAL_SUCCEED)
     612           0 :                 throw(MAL, "txt.minjarowinkler", OPERATION_FAILED);
     613             : 
     614           0 :         *res = (s > *threshold);
     615           0 :         return MAL_SUCCEED;
     616             : }
     617             : 
     618             : #define VALUE(s, x) (s##vars + VarHeapVal(s##vals, (x), s##width))
     619             : #define APPEND(b, o) (((oid *) b->theap->base)[b->batCount++] = (o))
     620             : 
     621             : #define PREP_BAT_STRITEM(B, CI, SI)                                                                     \
     622             :                 do {                                                                                                                    \
     623             :                         for (n = 0; n < CI.ncand; n++) {                                                     \
     624             :                                 SI[n].matches = 0;                                                                              \
     625             :                                 SI[n].o = canditer_next(&CI);                                                       \
     626             :                                 SI[n].val = (str) VALUE(B, SI[n].o - B->hseqbase);           \
     627             :                                 SI[n].cp_sequence = NULL;                                                               \
     628             :                                 SI[n].len = UTF8_strlen(SI[n].val);                                     \
     629             :                                 SI[n].cp_seq_len = str_strlen(SI[n].val);                               \
     630             :                                 if ((msg = str_2_codepointseq(&SI[n])) != MAL_SUCCEED)      \
     631             :                                         goto exit;                                                                                      \
     632             :                                 str_alphabet_bitmap(&SI[n]);                                                        \
     633             :                         }                                                                                                                       \
     634             :                 } while (false)
     635             : 
     636             : #define FINALIZE_BATS(L, R, LCI, RCI, LSI, RSI) \
     637             :                 do {                                                                            \
     638             :                         assert(BATcount(L) == BATcount(R));     \
     639             :                         BATsetcount(L, BATcount(L));                    \
     640             :                         BATsetcount(R, BATcount(R));                    \
     641             :                         for (n = 0; n < LCI.ncand; n++) {            \
     642             :                                 if (LSI[n].matches > 1) {                    \
     643             :                                         L->tkey = false;                             \
     644             :                                         break;                                                  \
     645             :                                 }                                                                       \
     646             :                         }                                                                               \
     647             :                         if (n == LCI.ncand) {                                   \
     648             :                                 L->tkey = true;                                      \
     649             :                         }                                                                               \
     650             :                         for (n = 0; n < RCI.ncand; n++) {            \
     651             :                                 if (RSI[n].matches > 1) {                    \
     652             :                                         R->tkey = false;                             \
     653             :                                         break;                                                  \
     654             :                                 }                                                                       \
     655             :                         }                                                                               \
     656             :                         if (n == RCI.ncand) {                                   \
     657             :                                 R->tkey = true;                                      \
     658             :                         }                                                                               \
     659             :                         BATordered(L);                                                  \
     660             :                         BATordered(R);                                                  \
     661             :                         L->theap->dirty |= BATcount(L) > 0;    \
     662             :                         R->theap->dirty |= BATcount(R) > 0;    \
     663             :                 } while (false)
     664             : 
     665             : static inline int
     666          12 : maxlevenshtein_extcol_stritem(const str_item *si1, const str_item *si2,
     667             :                                                           unsigned int *column, const int k)
     668             : {
     669          12 :         unsigned int lastdiag, olddiag;
     670          12 :         int c1, c2, x, y;
     671          12 :         unsigned int min;
     672          12 :         int s1len = si1->len, s2len = si2->len;
     673          12 :         int *s1 = si1->cp_sequence, *s2 = si2->cp_sequence;
     674             :         /* first test if the strings are equal */
     675          12 :         if (s1len == s2len) {
     676           1 :                 for (x = 0; x < s1len; x++)
     677           1 :                         if (s1[x] != s2[x])
     678             :                                 break;
     679           1 :                 if (x == s1len)
     680             :                         return 0;
     681             :         }
     682          68 :         for (y = 1; y <= s1len; y++)
     683          56 :                 column[y] = y;
     684          78 :         for (x = 1; x <= s2len; x++) {
     685          69 :                 c2 = s2[x - 1];
     686          69 :                 column[0] = x;
     687          69 :                 min = INT_MAX;
     688         395 :                 for (y = 1, lastdiag = x - 1; y <= s1len; y++) {
     689         326 :                         olddiag = column[y];
     690         326 :                         c1 = s1[y - 1];
     691         326 :                         column[y] = MIN3(column[y] + 1, column[y - 1] + 1, lastdiag + (c1 == c2 ? 0 : 1));
     692         326 :                         lastdiag = olddiag;
     693         326 :                         if (lastdiag < min)
     694             :                                 min = lastdiag;
     695             :                 }
     696          69 :                 if (min > (unsigned int) k)
     697             :                         return INT_MAX;
     698             :         }
     699           9 :         return column[s1len];
     700             : }
     701             : 
     702             : static str
     703           6 : maxlevenshteinjoin(BAT **r1, BAT **r2, BAT *l, BAT *r, BAT *sl, BAT *sr, int k)
     704             : {
     705           6 :         BAT *r1t = NULL, *r2t = NULL;
     706           6 :         BUN n;
     707           6 :         struct canditer lci, rci;
     708           6 :         const char *lvals, *rvals, *lvars, *rvars;;
     709           6 :         int lwidth, rwidth, d;
     710           6 :         str_item *lsi = NULL, *rsi = NULL;
     711           6 :         str msg = MAL_SUCCEED;
     712           6 :         unsigned int *buffer = NULL;
     713             : 
     714          18 :         assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));
     715           6 :         assert(ATOMtype(l->ttype) == TYPE_str);
     716             : 
     717           6 :         canditer_init(&lci, l, sl);
     718           6 :         canditer_init(&rci, r, sr);
     719             : 
     720           6 :         if (lci.ncand == 0 || rci.ncand == 0)
     721           0 :                 goto exit;
     722             : 
     723           6 :         lvals = (const char *) Tloc(l, 0);
     724           6 :         rvals = (const char *) Tloc(r, 0);
     725           6 :         lvars = l->tvheap->base;
     726           6 :         rvars = r->tvheap->base;
     727           6 :         lwidth = l->twidth;
     728           6 :         rwidth = r->twidth;
     729             : 
     730           6 :         if ((r1t = COLnew(0, TYPE_oid, lci.ncand, TRANSIENT)) == NULL ||
     731           6 :                 (r2t = COLnew(0, TYPE_oid, rci.ncand, TRANSIENT)) == NULL) {
     732           0 :                 reclaim_bats(2, r1t, r2t);
     733           0 :                 msg = createException(MAL, "txtsim.maxlevenshteinjoin",
     734             :                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     735           0 :                 goto exit;
     736             :         }
     737             : 
     738           6 :         r1t->tsorted = r1t->trevsorted = false;
     739           6 :         r2t->tsorted = r2t->trevsorted = false;
     740             : 
     741           6 :         if ((lsi = GDKmalloc(lci.ncand * sizeof(str_item))) == NULL ||
     742           6 :                 (rsi = GDKmalloc(rci.ncand * sizeof(str_item))) == NULL) {
     743           0 :                 reclaim_bats(2, r1t, r2t);
     744           0 :                 msg = createException(MAL, "txtsim.maxlevenshteinjoin",
     745             :                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     746           0 :                 goto exit;
     747             :         }
     748             : 
     749          12 :         PREP_BAT_STRITEM(l, lci, lsi);
     750          24 :         PREP_BAT_STRITEM(r, rci, rsi);
     751           6 :         qsort(lsi, lci.ncand, sizeof(str_item), str_item_lenrev_cmp);
     752           6 :         qsort(rsi, rci.ncand, sizeof(str_item), str_item_lenrev_cmp);
     753             : 
     754           6 :         if ((buffer = GDKmalloc((lsi[0].len + 1) * sizeof(int))) == NULL) {
     755           0 :                 msg = createException(MAL, "txtsim.maxlevenshteinjoin",
     756             :                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     757           0 :                 goto exit;
     758             :         }
     759             : 
     760             :         /* join loop */
     761          24 :         for (BUN lstart = 0, rstart = 0; rstart < rci.ncand; rstart++) {
     762          34 :                 for (n = lstart; n < lci.ncand; n++) {
     763             :                         /* first and cheapest filter */
     764          18 :                         if ((lsi[n].len) > k + rsi[rstart].len) {
     765           0 :                                 lstart++;
     766           0 :                                 continue;               /* no possible matches yet for this r */
     767          18 :                         } else if (rsi[rstart].len > k + lsi[n].len) {
     768             :                                 break;                  /* no more possible matches from this r */
     769             :                         }
     770             :                         /* filter by comparing alphabet bitmaps (imprecise but fast filter) */
     771          16 :                         d = MAX(lsi[n].abm_popcount,
     772             :                                         rsi[rstart].abm_popcount) -
     773          16 :                                         popcount64(lsi[n].abm & rsi[rstart].abm);
     774          16 :                         if (d > k)
     775           4 :                                 continue;
     776             :                         /* final and most expensive test: Levenshtein distance */
     777          12 :                         d = maxlevenshtein_extcol_stritem(&lsi[n], &rsi[rstart], buffer,
     778             :                                                                                           (const unsigned int) k);
     779          12 :                         if (d > k)
     780           6 :                                 continue;
     781             :                         /* The match test succeeded */
     782           6 :                         lsi[n].matches++;
     783           6 :                         rsi[rstart].matches++;
     784           6 :                         if (bunfastappTYPE(oid, r1t, &(lsi[n].o)) != GDK_SUCCEED) {
     785           0 :                                 reclaim_bats(2, r1t, r2t);
     786           0 :                                 msg = createException(MAL, "txtsim.maxlevenshteinjoin",
     787             :                                                                           OPERATION_FAILED "Failed bun append");
     788           0 :                                 goto exit;
     789             :                         }
     790           6 :                         if (bunfastappTYPE(oid, r2t, &(rsi[rstart].o)) != GDK_SUCCEED) {
     791           0 :                                 reclaim_bats(2, r1t, r2t);
     792           0 :                                 msg = createException(MAL, "txtsim.maxlevenshteinjoin",
     793             :                                                                           OPERATION_FAILED "Failed bun append");
     794           0 :                                 goto exit;
     795             :                         }
     796             :                 }
     797             :         }
     798             : 
     799          29 :         FINALIZE_BATS(r1t, r2t, lci, rci, lsi, rsi);
     800           6 :         *r1 = r1t;
     801           6 :         *r2 = r2t;
     802             : 
     803           6 :   exit:
     804          12 :         for (n = 0; lsi && n < lci.ncand; n++)
     805           6 :                 GDKfree(lsi[n].cp_sequence);
     806          24 :         for (n = 0; rsi && n < rci.ncand; n++)
     807          18 :                 GDKfree(rsi[n].cp_sequence);
     808           6 :         GDKfree(lsi);
     809           6 :         GDKfree(rsi);
     810           6 :         GDKfree(buffer);
     811           6 :         return msg;
     812             : }
     813             : 
     814             : static str
     815           6 : TXTSIMmaxlevenshteinjoin(bat *r1, bat *r2, const bat *lid, const bat *rid,
     816             :                                                  const bat *kid, const bat *slid, const bat *srid,
     817             :                                                  const bit *nil_matches, const lng *estimate,
     818             :                                                  const bit *anti)
     819             : {
     820           6 :         (void) nil_matches;
     821           6 :         (void) estimate;
     822           6 :         (void) anti;
     823             : 
     824           6 :         BAT *bleft = NULL, *bright = NULL, *bk = NULL,
     825           6 :                         *bcandleft = NULL, *bcandright = NULL, *r1t = NULL, *r2t = NULL;
     826           6 :         int k = 0;
     827           6 :         str msg = MAL_SUCCEED;
     828             : 
     829           6 :         if ((bleft = BATdescriptor(*lid)) == NULL ||
     830           6 :                 (bright = BATdescriptor(*rid)) == NULL ||
     831           6 :                 (bk = BATdescriptor(*kid)) == NULL) {
     832           0 :                 msg = createException(MAL, "txtsim.maxlevenshteinjoin",
     833             :                                                           SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
     834           0 :                 goto exit;
     835             :         }
     836             : 
     837           6 :         if ((*slid != bat_nil && (bcandleft = BATdescriptor(*slid)) == NULL) ||
     838           6 :                 (*srid != bat_nil && (bcandright = BATdescriptor(*srid)) == NULL)) {
     839           0 :                 msg = createException(MAL, "txtsim.maxlevenshteinjoin",
     840             :                                                           SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
     841           0 :                 goto exit;
     842             :         }
     843             : 
     844           6 :         if (BATcount(bk) > 0) {
     845           6 :                 BATiter ki = bat_iterator(bk);
     846           6 :                 k = *(int *) BUNtloc(ki, 0);
     847           6 :                 bat_iterator_end(&ki);
     848             :         }
     849             : 
     850           6 :         if ((msg = maxlevenshteinjoin(&r1t, &r2t, bleft, bright, bcandleft, bcandright, k)) != MAL_SUCCEED)
     851           0 :                 goto exit;
     852             : 
     853           6 :         *r1 = r1t->batCacheid;
     854           6 :         *r2 = r2t->batCacheid;
     855           6 :         BBPkeepref(r1t);
     856           6 :         BBPkeepref(r2t);
     857             : 
     858           6 :   exit:
     859           6 :         reclaim_bats(5, bleft, bright, bcandleft, bcandright, bk);
     860           6 :         if (msg != MAL_SUCCEED)
     861           0 :                 reclaim_bats(2, r1t, r2t);
     862           6 :         return msg;
     863             : }
     864             : 
     865             : static inline void
     866           9 : jarowinkler_rangebounds(int *lb, int *ub, const str_item *a, const double lp,
     867             :                                                 const double threshold)
     868             : {
     869           9 :         *lb = (int) floor(3.0 * a->len * (threshold - lp) / (1.0 - lp) -
     870           9 :                                           (2.0 * a->len));
     871           9 :         *ub = (int) ceil(a->len / ((3.0 * (threshold - lp) / (1.0 - lp)) - 2.0));
     872           9 : }
     873             : 
     874             : /* version with given lp and m, and t = 0*/
     875             : static inline double
     876          19 : jarowinkler_lp_m_t0(const str_item *lsi, const str_item *rsi, double lp, int m)
     877             : {
     878          19 :         double dw;
     879             :         /* Jaro similarity */
     880          19 :         dw = (((double) m / lsi->len) + ((double) m / rsi->len) + 1.0) / 3.0;
     881             :         /* Jaro-Winkler similarity */
     882          19 :         dw = dw + (lp * (1 - dw));
     883          19 :         return dw;
     884             : }
     885             : 
     886             : static str
     887           9 : minjarowinklerjoin(BAT **r1, BAT **r2, BAT *l, BAT *r, BAT *sl, BAT *sr,
     888             :                                    const dbl threshold)
     889             : {
     890           9 :         BAT *r1t = NULL, *r2t = NULL;
     891           9 :         BUN n;
     892           9 :         struct canditer lci, rci;
     893           9 :         const char *lvals, *rvals, *lvars, *rvars;
     894           9 :         int lwidth, rwidth, lb = 0, ub = 0, m = -1, *x_flags = NULL, *y_flags = NULL;
     895           9 :         str_item *ssl = NULL, *ssr = NULL, shortest;
     896           9 :         str msg = MAL_SUCCEED;
     897           9 :         const bool sliding_window_allowed = threshold > (2.01 + JARO_WINKLER_PREFIX_LEN * JARO_WINKLER_SCALING_FACTOR) / 3.0;
     898           9 :         double s, lp = 0;
     899             : 
     900          27 :         assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));
     901           9 :         assert(ATOMtype(l->ttype) == TYPE_str);
     902             : 
     903           9 :         canditer_init(&lci, l, sl);
     904           9 :         canditer_init(&rci, r, sr);
     905             : 
     906           9 :         if (lci.ncand == 0 || rci.ncand == 0)
     907           0 :                 goto exit;
     908             : 
     909           9 :         lvals = (const char *) Tloc(l, 0);
     910           9 :         rvals = (const char *) Tloc(r, 0);
     911           9 :         assert(r->ttype);
     912           9 :         lvars = l->tvheap->base;
     913           9 :         rvars = r->tvheap->base;
     914           9 :         lwidth = l->twidth;
     915           9 :         rwidth = r->twidth;
     916             : 
     917           9 :         if ((r1t = COLnew(0, TYPE_oid, lci.ncand, TRANSIENT)) == NULL ||
     918           9 :                 (r2t = COLnew(0, TYPE_oid, rci.ncand, TRANSIENT)) == NULL) {
     919           0 :                 reclaim_bats(2, r1t, r2t);
     920           0 :                 msg = createException(MAL, "txtsim.minjarowinklerjoin",
     921             :                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     922           0 :                 goto exit;
     923             :         }
     924             : 
     925           9 :         r1t->tsorted = r1t->trevsorted = false;
     926           9 :         r2t->tsorted = r2t->trevsorted = false;
     927             : 
     928           9 :         if ((ssl = GDKmalloc(lci.ncand * sizeof(str_item))) == NULL ||
     929           9 :                 (ssr = GDKmalloc(rci.ncand * sizeof(str_item))) == NULL) {
     930           0 :                 reclaim_bats(2, r1t, r2t);
     931           0 :                 msg = createException(MAL, "txtsim.maxlevenshteinjoin",
     932             :                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     933           0 :                 goto exit;
     934             :         }
     935             : 
     936          18 :         PREP_BAT_STRITEM(l, lci, ssl);
     937          36 :         PREP_BAT_STRITEM(r, rci, ssr);
     938           9 :         qsort(ssl, lci.ncand, sizeof(str_item), str_item_lenrev_cmp);
     939           9 :         qsort(ssr, rci.ncand, sizeof(str_item), str_item_lenrev_cmp);
     940             : 
     941           9 :         if ((x_flags = GDKmalloc(ssl[0].len * sizeof(int))) == NULL ||
     942           9 :                 (y_flags = GDKmalloc(ssr[0].len * sizeof(int))) == NULL) {
     943           0 :                 msg = createException(MAL, "txtsim.minjarowinklerjoin",
     944             :                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     945           0 :                 goto exit;
     946             :         }
     947             :         // lp used for filters. Use -1 for actual JW (forces to compute actual lp)
     948          36 :         lp = JARO_WINKLER_PREFIX_LEN * JARO_WINKLER_SCALING_FACTOR;
     949             : 
     950             :         /* join loop */
     951          36 :         for (BUN lstart = 0, rstart = 0; rstart < rci.ncand; rstart++) {
     952          27 :                 if (sliding_window_allowed)
     953           9 :                         jarowinkler_rangebounds(&lb, &ub, &ssr[rstart], lp, threshold);
     954          47 :                 for (n = lstart; n < lci.ncand; n++) {
     955             :                         /* Update sliding window */
     956             :                         /* This is the first and cheapest filter */
     957          27 :                         if (sliding_window_allowed) {
     958           9 :                                 if (ssl[n].len > ub) {       /* no possible matches yet for this r */
     959           1 :                                         lstart++;
     960           1 :                                         continue;
     961             :                                 }
     962           8 :                                 if (ssl[n].len < lb) {       /* no more possible matches from this r */
     963             :                                         break;
     964             :                                 }
     965             :                         }
     966             :                         /* filter by comparing alphabet bitmaps */
     967             :                         /* find the best possible m: the length of the shorter string
     968             :                          * minus the number of characters that surely cannot match */
     969          19 :                         shortest = ssl[n].len < ssr[rstart].len ? ssl[n] : ssr[rstart];
     970          19 :                         m = shortest.len - popcount64(shortest.abm -
     971          19 :                                                                                   (ssl[n].abm & ssr[rstart].abm));
     972             :                         /* equivalent to:
     973             :                            m = shortest.len - popcount64(shortest.abm & (~(ssl[n].abm & ssr[rstart].abm))); */
     974          19 :                         s = jarowinkler_lp_m_t0(&ssl[n], &ssr[rstart], lp, m);
     975          19 :                         if (s < threshold) {
     976           6 :                                 continue;
     977             :                         }
     978             :                         /* final and most expensive test: Jaro-Winkler similarity */
     979          13 :                         s = jarowinkler(&ssl[n], &ssr[rstart], -1, x_flags, y_flags);
     980          13 :                         if (s < threshold) {
     981           4 :                                 continue;
     982             :                         }
     983             :                         /* The match test succeeded */
     984           9 :                         ssl[n].matches++;
     985           9 :                         ssr[rstart].matches++;
     986           9 :                         if (bunfastappTYPE(oid, r1t, &(ssl[n].o)) != GDK_SUCCEED) {
     987           0 :                                 reclaim_bats(2, r1t, r2t);
     988           0 :                                 msg = createException(MAL, "txtsim.maxlevenshteinjoin",
     989             :                                                                           OPERATION_FAILED "Failed bun append");
     990           0 :                                 goto exit;
     991             :                         }
     992           9 :                         if (bunfastappTYPE(oid, r2t, &(ssr[rstart].o)) != GDK_SUCCEED) {
     993           0 :                                 reclaim_bats(2, r1t, r2t);
     994           0 :                                 msg = createException(MAL, "txtsim.maxlevenshteinjoin",
     995             :                                                                           OPERATION_FAILED "Failed bun append");
     996           0 :                                 goto exit;
     997             :                         }
     998             :                 }
     999             :         }
    1000             : 
    1001          42 :         FINALIZE_BATS(r1t, r2t, lci, rci, ssl, ssr);
    1002           9 :         *r1 = r1t;
    1003           9 :         *r2 = r2t;
    1004             : 
    1005           9 :   exit:
    1006           9 :         if (ssl)
    1007          18 :                 for (n = 0; n < lci.ncand; n++)
    1008           9 :                         GDKfree(ssl[n].cp_sequence);
    1009           9 :         if (ssr)
    1010          36 :                 for (n = 0; n < rci.ncand; n++)
    1011          27 :                         GDKfree(ssr[n].cp_sequence);
    1012           9 :         GDKfree(x_flags);
    1013           9 :         GDKfree(y_flags);
    1014           9 :         GDKfree(ssl);
    1015           9 :         GDKfree(ssr);
    1016           9 :         return msg;
    1017             : }
    1018             : 
    1019             : static str
    1020           9 : TXTSIMminjarowinklerjoin(bat *r1, bat *r2, const bat *lid, const bat *rid,
    1021             :                                                  const bat *thresholdid, const bat *slid,
    1022             :                                                  const bat *srid, const bit *nil_matches,
    1023             :                                                  const lng *estimate, const bit *anti)
    1024             : {
    1025           9 :         (void) nil_matches;
    1026           9 :         (void) estimate;
    1027           9 :         (void) anti;
    1028             : 
    1029           9 :         BAT *bleft = NULL, *bright = NULL,
    1030           9 :                         *bcandleft = NULL, *bcandright = NULL, *bthreshold = NULL,
    1031           9 :                         *r1t = NULL, *r2t = NULL;
    1032           9 :         dbl threshold = 1;
    1033           9 :         str msg = MAL_SUCCEED;
    1034             : 
    1035           9 :         if ((bleft = BATdescriptor(*lid)) == NULL ||
    1036           9 :                 (bright = BATdescriptor(*rid)) == NULL ||
    1037           9 :                 (bthreshold = BATdescriptor(*thresholdid)) == NULL) {
    1038           0 :                 msg = createException(MAL, "txtsim.minjarowinklerjoin",
    1039             :                                                           SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    1040           0 :                 goto exit;
    1041             :         }
    1042             : 
    1043           9 :         if ((*slid != bat_nil && (bcandleft = BATdescriptor(*slid)) == NULL) ||
    1044           9 :                 (*srid != bat_nil && (bcandright = BATdescriptor(*srid)) == NULL)) {
    1045           0 :                 msg = createException(MAL, "txtsim.minjarowinklerjoin",
    1046             :                                                           SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    1047           0 :                 goto exit;
    1048             :         }
    1049             : 
    1050           9 :         if (BATcount(bthreshold) > 0) {
    1051           9 :                 BATiter thresholdi = bat_iterator(bthreshold);
    1052           9 :                 threshold = *(dbl *) BUNtail(thresholdi, 0);
    1053           9 :                 bat_iterator_end(&thresholdi);
    1054             :         }
    1055             : 
    1056           9 :         if ((msg = minjarowinklerjoin(&r1t, &r2t, bleft, bright, bcandleft, bcandright, threshold)) != MAL_SUCCEED)
    1057           0 :                 goto exit;
    1058             : 
    1059           9 :         *r1 = r1t->batCacheid;
    1060           9 :         *r2 = r2t->batCacheid;
    1061           9 :         BBPkeepref(r1t);
    1062           9 :         BBPkeepref(r2t);
    1063             : 
    1064           9 :   exit:
    1065           9 :         reclaim_bats(5, bleft, bright, bcandleft, bcandright, bthreshold);
    1066           9 :         if (msg != MAL_SUCCEED)
    1067           0 :                 reclaim_bats(2, r1t, r2t);
    1068           9 :         return msg;
    1069             : }
    1070             : 
    1071             : #define SoundexLen 4                    /* length of a soundex code */
    1072             : #define SoundexKey "Z000"             /* default key for soundex code */
    1073             : 
    1074             : /* set letter values */
    1075             : static const int Code[] = { 0, 1, 2, 3, 0, 1,
    1076             :         2, 0, 0, 2, 2, 4,
    1077             :         5, 5, 0, 1, 2, 6,
    1078             :         2, 3, 0, 1, 0, 2, 0, 2
    1079             : };
    1080             : 
    1081             : #define RETURN_NIL_IF(b,t)                                                                                              \
    1082             :         if (b) {                                                                                                                        \
    1083             :                 if (ATOMextern(t)) {                                                                                    \
    1084             :                         *(ptr*) res = (ptr) ATOMnil(t);                                                 \
    1085             :                         if ( *(ptr *) res == NULL)                                                                      \
    1086             :                                 throw(MAL,"txtsim", SQLSTATE(HY013) MAL_MALLOC_FAIL); \
    1087             :                 } else {                                                                                                                \
    1088             :                         memcpy(res, ATOMnilptr(t), ATOMsize(t));                                        \
    1089             :                 }                                                                                                                               \
    1090             :                 return MAL_SUCCEED;                                                                                     \
    1091             :         }
    1092             : 
    1093             : static inline char
    1094         151 : SCode(unsigned char c)
    1095             : {
    1096         151 :         if (c == 95)
    1097             :                 return (2);                             /* german sz */
    1098         151 :         return (Code[toupper(c) - 'A']);
    1099             : }
    1100             : 
    1101             : static str
    1102          67 : soundex_code(const char *Name, char *Key)
    1103             : {
    1104          67 :         char LastLetter;
    1105          67 :         int Index;
    1106             : 
    1107         303 :         for (const char *p = Name; *p; p++)
    1108         237 :                 if ((*p & 0x80) != 0)
    1109           1 :                         throw(MAL, "soundex",
    1110             :                                   SQLSTATE(42000)
    1111             :                                   "Soundex function not available for non ASCII strings");
    1112             : 
    1113             :         /* set default key */
    1114          66 :         strcpy(Key, SoundexKey);
    1115             : 
    1116             :         /* keep first letter */
    1117          66 :         Key[0] = *Name;
    1118          66 :         if (!isupper((unsigned char) (Key[0])))
    1119          23 :                 Key[0] = toupper(Key[0]);
    1120             : 
    1121          66 :         LastLetter = *Name;
    1122          66 :         if (!*Name)
    1123             :                 return MAL_SUCCEED;
    1124          56 :         Name++;
    1125             : 
    1126             :         /* scan rest of string */
    1127         222 :         for (Index = 1; (Index <SoundexLen) &&*Name; Name++) {
    1128             :                 /* use only letters */
    1129         166 :                 if (isalpha((unsigned char) (*Name))) {
    1130             :                         /* ignore duplicate successive chars */
    1131         163 :                         if (LastLetter != *Name) {
    1132             :                                 /* new LastLetter */
    1133         151 :                                 LastLetter = *Name;
    1134             : 
    1135             :                                 /* ignore letters with code 0 */
    1136         151 :                                 if (SCode(*Name) != 0) {
    1137          72 :                                         Key[Index] = '0' + SCode(*Name);
    1138          72 :                                         Index ++;
    1139             :                                 }
    1140             :                         }
    1141             :                 }
    1142             :         }
    1143             :         return MAL_SUCCEED;
    1144             : }
    1145             : 
    1146             : static str
    1147          71 : soundex(str *res, str *Name)
    1148             : {
    1149          71 :         str msg = MAL_SUCCEED;
    1150             : 
    1151          71 :         GDKfree(*res);
    1152          75 :         RETURN_NIL_IF(strNil(*Name), TYPE_str);
    1153             : 
    1154          67 :         *res = (str) GDKmalloc(sizeof(char) * (SoundexLen + 1));
    1155          67 :         if (*res == NULL)
    1156           0 :                 throw(MAL, "soundex", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1157             : 
    1158             :         /* calculate Key for Name */
    1159          67 :         if ((msg = soundex_code(*Name, *res))) {
    1160           1 :                 GDKfree(*res);
    1161           1 :                 *res = NULL;
    1162           1 :                 return msg;
    1163             :         }
    1164             :         return msg;
    1165             : }
    1166             : 
    1167             : static str
    1168          30 : stringdiff(int *res, str *s1, str *s2)
    1169             : {
    1170          30 :         str r = MAL_SUCCEED;
    1171          30 :         char *S1 = NULL, *S2 = NULL;
    1172             : 
    1173          30 :         r = soundex(&S1, s1);
    1174          30 :         if (r != MAL_SUCCEED)
    1175             :                 return r;
    1176          30 :         r = soundex(&S2, s2);
    1177          30 :         if (r != MAL_SUCCEED) {
    1178           0 :                 GDKfree(S1);
    1179           0 :                 return r;
    1180             :         }
    1181          30 :         r = TXTSIMdameraulevenshtein1(res, &S1, &S2);
    1182          30 :         GDKfree(S1);
    1183          30 :         GDKfree(S2);
    1184          30 :         return r;
    1185             : }
    1186             : 
    1187             : /******************************
    1188             :  * QGRAMNORMALIZE
    1189             :  *
    1190             :  * This function 'normalizes' a string so valid q-grams can  be made of it:
    1191             :  * All characters are transformed to uppercase, and all characters
    1192             :  * which are not letters or digits are stripped to a single space.
    1193             :  *
    1194             :  * qgramnormalize("Hallo, allemaal!").print(); --> "HALLO ALLEMAAL"
    1195             :  * qgramnormalize(" '' t ' est").print(); --> [ "T EST" ]
    1196             :  *
    1197             :  *****************************/
    1198             : static str
    1199          13 : qgram_normalize(str *res, str *Input)
    1200             : {
    1201          13 :         char *input = *Input;
    1202          13 :         int i, j = 0;
    1203          13 :         char c, last = ' ';
    1204             : 
    1205          13 :         GDKfree(*res);
    1206          13 :         RETURN_NIL_IF(strNil(input), TYPE_str);
    1207          13 :         *res = (str) GDKmalloc(sizeof(char) * (strlen(input) + 1));     /* normalized strings are never longer than original */
    1208          13 :         if (*res == NULL)
    1209           0 :                 throw(MAL, "txtsim.qgramnormalize", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1210             : 
    1211         100 :         for (i = 0; input[i]; i++) {
    1212          87 :                 c = toupper(input[i]);
    1213          87 :                 if (!(('A' <= c && c <= 'Z') || isdigit((unsigned char) c)))
    1214          87 :                         c = ' ';
    1215          87 :                 if (c != ' ' || last != ' ') {
    1216          78 :                         (*res)[j++] = c;
    1217             :                 }
    1218          87 :                 last = c;
    1219             :         }
    1220          13 :         (*res)[j] = 0;
    1221             :         /* strip final whitespace */
    1222          16 :         while (j > 0 && (*res)[--j] == ' ')
    1223           3 :                 (*res)[j] = 0;
    1224             : 
    1225             :         return MAL_SUCCEED;
    1226             : }
    1227             : 
    1228             : static str
    1229           0 : qgram_selfjoin(bat *res1, bat *res2, bat *qid, bat *bid, bat *pid, bat *lid,
    1230             :                            flt *c, int *k)
    1231             : {
    1232           0 :         BAT *qgram, *id, *pos, *len;
    1233           0 :         BUN n;
    1234           0 :         BUN i, j;
    1235           0 :         BAT *bn, *bn2;
    1236           0 :         oid *qbuf;
    1237           0 :         int *ibuf;
    1238           0 :         int *pbuf;
    1239           0 :         int *lbuf;
    1240           0 :         str msg = MAL_SUCCEED;
    1241             : 
    1242           0 :         qgram = BATdescriptor(*qid);
    1243           0 :         id = BATdescriptor(*bid);
    1244           0 :         pos = BATdescriptor(*pid);
    1245           0 :         len = BATdescriptor(*lid);
    1246           0 :         if (qgram == NULL || id == NULL || pos == NULL || len == NULL) {
    1247           0 :                 BBPreclaim(qgram);
    1248           0 :                 BBPreclaim(id);
    1249           0 :                 BBPreclaim(pos);
    1250           0 :                 BBPreclaim(len);
    1251           0 :                 throw(MAL, "txtsim.qgramselfjoin",
    1252             :                           SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    1253             :         }
    1254             : 
    1255           0 :         BATiter qgrami = bat_iterator(qgram);
    1256           0 :         BATiter idi = bat_iterator(id);
    1257           0 :         BATiter posi = bat_iterator(pos);
    1258           0 :         BATiter leni = bat_iterator(len);
    1259           0 :         if (qgrami.type != TYPE_oid)
    1260           0 :                 msg = createException(MAL, "txtsim.qgramselfjoin",
    1261             :                                                           SEMANTIC_TYPE_MISMATCH
    1262             :                                                           ": tail of BAT qgram must be oid");
    1263           0 :         else if (idi.type != TYPE_int)
    1264           0 :                 msg = createException(MAL, "txtsim.qgramselfjoin",
    1265             :                                                           SEMANTIC_TYPE_MISMATCH
    1266             :                                                           ": tail of BAT id must be int");
    1267           0 :         else if (posi.type != TYPE_int)
    1268           0 :                 msg = createException(MAL, "txtsim.qgramselfjoin",
    1269             :                                                           SEMANTIC_TYPE_MISMATCH
    1270             :                                                           ": tail of BAT pos must be int");
    1271           0 :         else if (leni.type != TYPE_int)
    1272           0 :                 msg = createException(MAL, "txtsim.qgramselfjoin",
    1273             :                                                           SEMANTIC_TYPE_MISMATCH
    1274             :                                                           ": tail of BAT len must be int");
    1275           0 :         if (msg) {
    1276           0 :                 bat_iterator_end(&qgrami);
    1277           0 :                 bat_iterator_end(&idi);
    1278           0 :                 bat_iterator_end(&posi);
    1279           0 :                 bat_iterator_end(&leni);
    1280           0 :                 BBPunfix(qgram->batCacheid);
    1281           0 :                 BBPunfix(id->batCacheid);
    1282           0 :                 BBPunfix(pos->batCacheid);
    1283           0 :                 BBPunfix(len->batCacheid);
    1284           0 :                 return msg;
    1285             :         }
    1286             : 
    1287           0 :         n = BATcount(qgram);
    1288             : 
    1289             :         /* if (BATcount(qgram)>1 && !qgrami.sorted) throw(MAL, "txtsim.qgramselfjoin", SEMANTIC_TYPE_MISMATCH); */
    1290             : 
    1291           0 :         if (!ALIGNsynced(qgram, id))
    1292           0 :                  msg = createException(MAL, "txtsim.qgramselfjoin",
    1293             :                                                            SEMANTIC_TYPE_MISMATCH
    1294             :                                                            ": qgram and id are not synced");
    1295             : 
    1296           0 :         else if (!ALIGNsynced(qgram, pos))
    1297           0 :                 msg = createException(MAL, "txtsim.qgramselfjoin",
    1298             :                                                           SEMANTIC_TYPE_MISMATCH
    1299             :                                                           ": qgram and pos are not synced");
    1300           0 :         else if (!ALIGNsynced(qgram, len))
    1301           0 :                 msg = createException(MAL, "txtsim.qgramselfjoin",
    1302             :                                                           SEMANTIC_TYPE_MISMATCH
    1303             :                                                           ": qgram and len are not synced");
    1304             : 
    1305           0 :         else if (qgrami.width != ATOMsize(qgrami.type))
    1306           0 :                 msg = createException(MAL, "txtsim.qgramselfjoin",
    1307             :                                                           SEMANTIC_TYPE_MISMATCH
    1308             :                                                           ": qgram is not a true void bat");
    1309           0 :         else if (idi.width != ATOMsize(idi.type))
    1310           0 :                 msg = createException(MAL, "txtsim.qgramselfjoin",
    1311             :                                                           SEMANTIC_TYPE_MISMATCH
    1312             :                                                           ": id is not a true void bat");
    1313             : 
    1314           0 :         else if (posi.width != ATOMsize(posi.type))
    1315           0 :                 msg = createException(MAL, "txtsim.qgramselfjoin",
    1316             :                                                           SEMANTIC_TYPE_MISMATCH
    1317             :                                                           ": pos is not a true void bat");
    1318           0 :         else if (leni.width != ATOMsize(leni.type))
    1319           0 :                 msg = createException(MAL, "txtsim.qgramselfjoin",
    1320             :                                                           SEMANTIC_TYPE_MISMATCH
    1321             :                                                           ": len is not a true void bat");
    1322           0 :         if (msg) {
    1323           0 :                 bat_iterator_end(&qgrami);
    1324           0 :                 bat_iterator_end(&idi);
    1325           0 :                 bat_iterator_end(&posi);
    1326           0 :                 bat_iterator_end(&leni);
    1327           0 :                 BBPunfix(qgram->batCacheid);
    1328           0 :                 BBPunfix(id->batCacheid);
    1329           0 :                 BBPunfix(pos->batCacheid);
    1330           0 :                 BBPunfix(len->batCacheid);
    1331           0 :                 return msg;
    1332             :         }
    1333             : 
    1334           0 :         bn = COLnew(0, TYPE_int, n, TRANSIENT);
    1335           0 :         bn2 = COLnew(0, TYPE_int, n, TRANSIENT);
    1336           0 :         if (bn == NULL || bn2 == NULL) {
    1337           0 :                 bat_iterator_end(&qgrami);
    1338           0 :                 bat_iterator_end(&idi);
    1339           0 :                 bat_iterator_end(&posi);
    1340           0 :                 bat_iterator_end(&leni);
    1341           0 :                 BBPreclaim(bn);
    1342           0 :                 BBPreclaim(bn2);
    1343           0 :                 BBPunfix(qgram->batCacheid);
    1344           0 :                 BBPunfix(id->batCacheid);
    1345           0 :                 BBPunfix(pos->batCacheid);
    1346           0 :                 BBPunfix(len->batCacheid);
    1347           0 :                 throw(MAL, "txtsim.qgramselfjoin", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1348             :         }
    1349             : 
    1350           0 :         qbuf = (oid *) qgrami.base;
    1351           0 :         ibuf = (int *) idi.base;
    1352           0 :         pbuf = (int *) posi.base;
    1353           0 :         lbuf = (int *) leni.base;
    1354           0 :         for (i = 0; i < n - 1; i++) {
    1355           0 :                 for (j = i + 1;
    1356           0 :                          (j < n && qbuf[j] == qbuf[i]
    1357           0 :                           && pbuf[j] <= (pbuf[i] + (*k + *c * MIN(lbuf[i], lbuf[j]))));
    1358           0 :                          j++) {
    1359           0 :                         if (ibuf[i] != ibuf[j]
    1360           0 :                                 && abs(lbuf[i] - lbuf[j]) <= (*k + *c * MIN(lbuf[i], lbuf[j]))) {
    1361           0 :                                 if (BUNappend(bn, ibuf + i, false) != GDK_SUCCEED
    1362           0 :                                         || BUNappend(bn2, ibuf + j, false) != GDK_SUCCEED) {
    1363           0 :                                         bat_iterator_end(&qgrami);
    1364           0 :                                         bat_iterator_end(&idi);
    1365           0 :                                         bat_iterator_end(&posi);
    1366           0 :                                         bat_iterator_end(&leni);
    1367           0 :                                         BBPunfix(qgram->batCacheid);
    1368           0 :                                         BBPunfix(id->batCacheid);
    1369           0 :                                         BBPunfix(pos->batCacheid);
    1370           0 :                                         BBPunfix(len->batCacheid);
    1371           0 :                                         BBPreclaim(bn);
    1372           0 :                                         BBPreclaim(bn2);
    1373           0 :                                         throw(MAL, "txtsim.qgramselfjoin",
    1374             :                                                   SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1375             :                                 }
    1376             :                         }
    1377             :                 }
    1378             :         }
    1379           0 :         bat_iterator_end(&qgrami);
    1380           0 :         bat_iterator_end(&idi);
    1381           0 :         bat_iterator_end(&posi);
    1382           0 :         bat_iterator_end(&leni);
    1383             : 
    1384           0 :         BBPunfix(qgram->batCacheid);
    1385           0 :         BBPunfix(id->batCacheid);
    1386           0 :         BBPunfix(pos->batCacheid);
    1387           0 :         BBPunfix(len->batCacheid);
    1388             : 
    1389           0 :         *res1 = bn->batCacheid;
    1390           0 :         BBPkeepref(bn);
    1391           0 :         *res2 = bn2->batCacheid;
    1392           0 :         BBPkeepref(bn2);
    1393             : 
    1394           0 :         return MAL_SUCCEED;
    1395             : }
    1396             : 
    1397             : /* copy up to utf8len UTF-8 encoded characters from src to buf
    1398             :  * stop early if buf (size given by bufsize) is too small, or if src runs out
    1399             :  * return number of UTF-8 characters copied (excluding NUL)
    1400             :  * close with NUL if enough space */
    1401             : static size_t
    1402          26 : utf8strncpy(char *buf, size_t bufsize, const char *src, size_t utf8len)
    1403             : {
    1404          26 :         size_t cnt = 0;
    1405             : 
    1406         128 :         while (utf8len != 0 && *src != 0 && bufsize != 0) {
    1407         102 :                 bufsize--;
    1408         102 :                 utf8len--;
    1409         102 :                 cnt++;
    1410         102 :                 if (((*buf++ = *src++) & 0x80) != 0) {
    1411          40 :                         while ((*src & 0xC0) == 0x80 && bufsize != 0) {
    1412          20 :                                 *buf++ = *src++;
    1413          20 :                                 bufsize--;
    1414             :                         }
    1415             :                 }
    1416             :         }
    1417          26 :         if (bufsize != 0)
    1418          26 :                 *buf = 0;
    1419          26 :         return cnt;
    1420             : }
    1421             : 
    1422             : static str
    1423           2 : str_2_qgrams(bat *ret, str *val)
    1424             : {
    1425           2 :         BAT *bn;
    1426           2 :         size_t i, len = strlen(*val) + 5;
    1427           2 :         str s = GDKmalloc(len);
    1428           2 :         char qgram[4 * 6 + 1];          /* 4 UTF-8 code points plus NULL byte */
    1429             : 
    1430           2 :         if (s == NULL)
    1431           0 :                 throw(MAL, "txtsim.str2qgram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1432           2 :         strcpy(s, "##");
    1433           2 :         strcpy(s + 2, *val);
    1434           2 :         strcpy(s + len - 3, "$$");
    1435           2 :         bn = COLnew(0, TYPE_str, (BUN) strlen(*val), TRANSIENT);
    1436           2 :         if (bn == NULL) {
    1437           0 :                 GDKfree(s);
    1438           0 :                 throw(MAL, "txtsim.str2qgram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1439             :         }
    1440             : 
    1441             :         i = 0;
    1442          26 :         while (s[i]) {
    1443          26 :                 if (utf8strncpy(qgram, sizeof(qgram), s + i, 4) < 4)
    1444             :                         break;
    1445          24 :                 if (BUNappend(bn, qgram, false) != GDK_SUCCEED) {
    1446           0 :                         BBPreclaim(bn);
    1447           0 :                         GDKfree(s);
    1448           0 :                         throw(MAL, "txtsim.str2qgram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1449             :                 }
    1450          24 :                 if ((s[i++] & 0xC0) == 0xC0) {
    1451           8 :                         while ((s[i] & 0xC0) == 0x80)
    1452           4 :                                 i++;
    1453             :                 }
    1454             :         }
    1455           2 :         *ret = bn->batCacheid;
    1456           2 :         BBPkeepref(bn);
    1457           2 :         GDKfree(s);
    1458           2 :         return MAL_SUCCEED;
    1459             : }
    1460             : 
    1461             : #include "mel.h"
    1462             : mel_func txtsim_init_funcs[] = {
    1463             :         pattern("txtsim", "dameraulevenshtein", TXTSIMdameraulevenshtein, false, "Calculates Damerau-Levenshtein distance between two strings, operation costs (ins/del = 1, replacement = 1, transposition = 2)", args(1,3,arg("",int),arg("x",str),arg("y",str))),
    1464             :         pattern("txtsim", "dameraulevenshtein", TXTSIMdameraulevenshtein, false, "Calculates Damerau-Levenshtein distance between two strings, variable operation costs (ins/del, replacement, transposition)", args(1,6,arg("",int),arg("x",str),arg("y",str),arg("insdel_cost",int),arg("replace_cost",int),arg("transpose_cost",int))),
    1465             :         command("txtsim", "editdistance", TXTSIMdameraulevenshtein1, false, "Alias for Damerau-Levenshtein(str,str), insdel cost = 1, replace cost = 1 and transpose = 2", args(1,3, arg("",int),arg("s",str),arg("t",str))),
    1466             :         command("txtsim", "editdistance2", TXTSIMdameraulevenshtein2, false, "Alias for Damerau-Levenshtein(str,str), insdel cost = 1, replace cost = 1 and transpose = 1", args(1,3, arg("",int),arg("s",str),arg("t",str))),
    1467             :         pattern("txtsim", "levenshtein", TXTSIMlevenshtein, false, "Calculates Levenshtein distance between two strings, operation costs (ins/del = 1, replacement = 1)", args(1,3,arg("",int),arg("s",str),arg("t",str))),
    1468             :         pattern("txtsim", "levenshtein", TXTSIMlevenshtein, false, "Calculates Levenshtein distance between two strings, variable operation costs (ins/del, replacement)", args(1,5,arg("",int),arg("x",str),arg("y",str),arg("insdel_cost",int),arg("replace_cost",int))),
    1469             :         pattern("txtsim", "levenshtein", TXTSIMlevenshtein, false, "(Backwards compatibility purposes) Calculates Damerau-Levenshtein distance between two strings, variable operation costs (ins/del, replacement, transposition)", args(1,6,arg("",int),arg("x",str),arg("y",str),arg("insdel_cost",int),arg("replace_cost",int),arg("transpose_cost",int))),
    1470             :         pattern("txtsim", "maxlevenshtein", TXTSIMmaxlevenshtein, false, "Levenshtein distance with basic costs but up to a MAX", args(1, 4, arg("",int), arg("l",str),arg("r",str),arg("k",int))),
    1471             :         pattern("txtsim", "maxlevenshtein", TXTSIMmaxlevenshtein, false, "Levenshtein distance with variable costs but up to a MAX", args(1, 6, arg("",int), arg("l",str),arg("r",str),arg("k",int),arg("insdel_cost",int),arg("replace_cost",int))),
    1472             :         pattern("battxtsim", "maxlevenshtein", BATTXTSIMmaxlevenshtein, false, "Same as maxlevenshtein but for BATS", args(1, 4, batarg("",bit), batarg("l",str),batarg("r",str),arg("k",int))),
    1473             :         pattern("battxtsim", "maxlevenshtein", BATTXTSIMmaxlevenshtein, false, "Same as maxlevenshtein but for BATS", args(1, 6, batarg("",bit), batarg("l",str),batarg("r",str),arg("k",int),arg("insdel_cost",int),arg("replace_cost",int))),
    1474             :         command("txtsim", "maxlevenshteinjoin", TXTSIMmaxlevenshteinjoin, false, "", args(2,10, batarg("",oid),batarg("",oid),batarg("l",str),batarg("r",str),batarg("k",int),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
    1475             :         command("txtsim", "soundex", soundex, false, "Soundex function for phonetic matching", args(1,2, arg("",str),arg("name",str))),
    1476             :         command("txtsim", "stringdiff", stringdiff, false, "Calculate the soundexed editdistance", args(1,3, arg("",int),arg("s1",str),arg("s2",str))),
    1477             :         command("txtsim", "qgramnormalize", qgram_normalize, false, "'Normalizes' strings (eg. toUpper and replaces non-alphanumerics with one space", args(1,2, arg("",str),arg("input",str))),
    1478             :         command("txtsim", "qgramselfjoin", qgram_selfjoin, false, "QGram self-join on ordered(!) qgram tables and sub-ordered q-gram positions", args(2,8, batarg("",int),batarg("",int),batarg("qgram",oid),batarg("id",oid),batarg("pos",int),batarg("len",int),arg("c",flt),arg("k",int))),
    1479             :         command("txtsim", "str2qgrams", str_2_qgrams, false, "Break the string into 4-grams", args(1,2, batarg("",str),arg("s",str))),
    1480             :         command("txtsim", "jarowinkler", TXTSIMjarowinkler, false, "Calculate Jaro Winkler similarity", args(1,3, arg("",dbl),arg("x",str),arg("y",str))),
    1481             :         command("txtsim", "minjarowinkler", TXTSIMminjarowinkler, false, "", args(1, 4, arg("",bit), arg("l",str),arg("r",str),arg("threshold",dbl))),
    1482             :         command("txtsim", "minjarowinklerjoin", TXTSIMminjarowinklerjoin, false, "", args(2, 10, batarg("",oid),batarg("",oid), batarg("l",str),batarg("r",str),batarg("threshold",dbl),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
    1483             :         { .imp=NULL }
    1484             : };
    1485             : #include "mal_import.h"
    1486             : #ifdef _MSC_VER
    1487             : #undef read
    1488             : #pragma section(".CRT$XCU",read)
    1489             : #endif
    1490         329 : LIB_STARTUP_FUNC(init_txtsim_mal)
    1491         329 : { mal_module("txtsim", NULL, txtsim_init_funcs); }

Generated by: LCOV version 1.14