LCOV - code coverage report
Current view: top level - monetdb5/modules/mal - txtsim.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 533 832 64.1 %
Date: 2024-04-26 00:35:57 Functions: 27 32 84.4 %

          Line data    Source code
       1             : /*
       2             :  * SPDX-License-Identifier: MPL-2.0
       3             :  *
       4             :  * This Source Code Form is subject to the terms of the Mozilla Public
       5             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       6             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       7             :  *
       8             :  * Copyright 2024 MonetDB Foundation;
       9             :  * Copyright August 2008 - 2023 MonetDB B.V.;
      10             :  * Copyright 1997 - July 2008 CWI.
      11             :  */
      12             : 
      13             : #include "monetdb_config.h"
      14             : #include "gdk.h"
      15             : #include "str.h"
      16             : #include "mal.h"
      17             : #include "mal_exception.h"
      18             : #include "mal_interpreter.h"
      19             : #include <string.h>
      20             : #include <limits.h>
      21             : 
      22             : #define MIN3(X,Y,Z)   (MIN(MIN((X),(Y)),(Z)))
      23             : #define MAX3(X,Y,Z)   (MAX(MAX((X),(Y)),(Z)))
      24             : #define MIN4(W,X,Y,Z) (MIN(MIN(MIN(W,X),Y),Z))
      25             : 
      26             : static void
      27          15 : reclaim_bats(int nargs, ...)
      28             : {
      29          15 :         va_list valist;
      30             : 
      31          15 :         va_start(valist, nargs);
      32          90 :         for (int i = 0; i < nargs; i++) {
      33          75 :                 BAT *b = va_arg(valist, BAT *);
      34         120 :                 BBPreclaim(b);
      35             :         }
      36          15 :         va_end(valist);
      37          15 : }
      38             : 
      39             : static inline int *
      40        2143 : damerau_get_cellpointer(int *pOrigin, int col, int row, int nCols)
      41             : {
      42        2143 :         return pOrigin + col + (row * (nCols + 1));
      43             : }
      44             : 
      45             : static inline int
      46        1134 : damerau_getat(int *pOrigin, int col, int row, int nCols)
      47             : {
      48        1134 :         int *pCell;
      49        1134 :         pCell = damerau_get_cellpointer(pOrigin, col, row, nCols);
      50        1134 :         return *pCell;
      51             : }
      52             : 
      53             : static inline void
      54        1009 : damerau_putat(int *pOrigin, int col, int row, int nCols, int x)
      55             : {
      56        1009 :         int *pCell;
      57        1009 :         pCell = damerau_get_cellpointer(pOrigin, col, row, nCols);
      58        1009 :         *pCell = x;
      59             : }
      60             : 
      61             : static str
      62          55 : dameraulevenshtein(int *res, str *S, str *T, int insdel_cost, int replace_cost,
      63             :                                    int transpose_cost)
      64             : {
      65          55 :         char *s = *S;
      66          55 :         char *t = *T;
      67          55 :         int *d;                                         /* pointer to matrix */
      68          55 :         int n;                                          /* length of s */
      69          55 :         int m;                                          /* length of t */
      70          55 :         int i;                                          /* iterates through s */
      71          55 :         int j;                                          /* iterates through t */
      72          55 :         char s_i;                                       /* ith character of s */
      73          55 :         char t_j;                                       /* jth character of t */
      74          55 :         int cost;                                       /* cost */
      75          55 :         int cell;                                       /* contents of target cell */
      76          55 :         int above;                                      /* contents of cell immediately above */
      77          55 :         int left;                                       /* contents of cell immediately to left */
      78          55 :         int diag;                                       /* contents of cell immediately above and to left */
      79          55 :         lng sz;                                 /* number of cells in matrix */
      80          55 :         int diag2 = 0, cost2 = 0;
      81             : 
      82         104 :         if (strNil(*S) || strNil(*T)) {
      83           9 :                 *res = int_nil;
      84           9 :                 return MAL_SUCCEED;
      85             :         }
      86             : 
      87             :         /* Step 1 */
      88          46 :         n = (int) strlen(s);            /* 64bit: assume strings are less than 2 GB */
      89          46 :         m = (int) strlen(t);
      90          46 :         if (n == 0) {
      91           8 :                 *res = m;
      92           8 :                 return MAL_SUCCEED;
      93             :         }
      94          38 :         if (m == 0) {
      95           8 :                 *res = n;
      96           8 :                 return MAL_SUCCEED;
      97             :         }
      98          30 :         sz = (((lng)n) + 1) * (((lng)m) + 1) * sizeof(int);
      99          30 :         if (sz > (LL_CONSTANT(1)<<28))
     100           0 :                 throw(MAL, "dameraulevenshtein", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     101          30 :         d = (int *) GDKmalloc((size_t)sz);
     102          31 :         if (d == NULL)
     103           0 :                 throw(MAL, "dameraulevenshtein", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     104             : 
     105             :         /* Step 2 */
     106         214 :         for (i = 0; i <= n; i++) {
     107         183 :                 damerau_putat(d, i, 0, n, i);
     108             :         }
     109         187 :         for (j = 0; j <= m; j++) {
     110         156 :                 damerau_putat(d, 0, j, n, j);
     111             :         }
     112             :         /* Step 3 */
     113         175 :         for (i = 1; i <= n; i++) {
     114         144 :                 s_i = s[i - 1];
     115             :                 /* Step 4 */
     116         814 :                 for (j = 1; j <= m; j++) {
     117         670 :                         t_j = t[j - 1];
     118             :                         /* Step 5 */
     119         670 :                         if (s_i == t_j) {
     120             :                                 cost = 0;
     121             :                         } else {
     122         546 :                                 cost = replace_cost;
     123             :                         }
     124             :                         /* Step 6 */
     125         670 :                         above = damerau_getat(d, i - 1, j, n);
     126         670 :                         left = damerau_getat(d, i, j - 1, n);
     127         670 :                         diag = damerau_getat(d, i - 1, j - 1, n);
     128         670 :                         if (j >= 2 && i >= 2) {
     129             :                                 /* NEW: detect transpositions */
     130         433 :                                 diag2 = damerau_getat(d, i - 2, j - 2, n);
     131         433 :                                 if (s[i - 2] == t[j - 1] && s[i - 1] == t[j - 2]) {
     132             :                                         cost2 = transpose_cost;
     133             :                                 } else {
     134         412 :                                         cost2 = 2;
     135             :                                 }
     136         433 :                                 cell = MIN4(above + insdel_cost, left + insdel_cost,
     137             :                                                         diag + cost, diag2 + cost2);
     138             :                         } else {
     139         237 :                                 cell = MIN3(above + insdel_cost, left + insdel_cost,
     140             :                                                         diag + cost);
     141             :                         }
     142         670 :                         damerau_putat(d, i, j, n, cell);
     143             :                 }
     144             :         }
     145             :         /* Step 7 */
     146          31 :         *res = damerau_getat(d, n, m, n);
     147          31 :         GDKfree(d);
     148          31 :         return MAL_SUCCEED;
     149             : }
     150             : 
     151             : static str
     152          37 : TXTSIMdameraulevenshtein1(int *result, str *s, str *t)
     153             : {
     154           8 :         return dameraulevenshtein(result, s, t, 1, 1, 2);
     155             : }
     156             : 
     157             : static str
     158           9 : TXTSIMdameraulevenshtein2(int *result, str *s, str *t)
     159             : {
     160           9 :         return dameraulevenshtein(result, s, t, 1, 1, 1);
     161             : }
     162             : 
     163             : static str
     164           2 : TXTSIMdameraulevenshtein(Client cntxt, MalBlkPtr mb, MalStkPtr stk,
     165             :                                                  InstrPtr pci)
     166             : {
     167           2 :         (void) cntxt;
     168           2 :         (void) mb;
     169           2 :         int *res = getArgReference_int(stk, pci, 0);
     170           2 :         str *X = getArgReference_str(stk, pci, 1),
     171           2 :                 *Y = getArgReference_str(stk, pci, 2);
     172           2 :         int insdel_cost, replace_cost, transpose_cost;
     173             : 
     174           2 :         assert(pci->argc == 3 || pci->argc == 6);
     175             : 
     176           2 :         if (pci->argc == 3) {
     177             :                 insdel_cost = 1;
     178             :                 replace_cost = 1;
     179             :                 transpose_cost = 2;
     180             :         } else {
     181           1 :                 insdel_cost = *getArgReference_int(stk, pci, 3);
     182           1 :                 replace_cost = *getArgReference_int(stk, pci, 4);
     183           1 :                 transpose_cost = *getArgReference_int(stk, pci, 5);
     184             :         }
     185             : 
     186           2 :         return dameraulevenshtein(res, X, Y, insdel_cost, replace_cost,
     187             :                                                           transpose_cost);
     188             : }
     189             : 
     190             : static inline str
     191          12 : levenshtein(int *res, const str *X, const str *Y, int insdel_cost,
     192             :                         int replace_cost, int max)
     193             : {
     194          12 :         str x = *X, y = *Y, x_iter = x;
     195          12 :         unsigned int xlen, ylen, i = 0, j = 0;
     196          12 :         unsigned int last_diagonal, old_diagonal;
     197          12 :         int cx, cy;
     198          12 :         unsigned int *column, min;
     199             : 
     200          22 :         if (strNil(*X) || strNil(*Y)) {
     201           3 :                 *res = int_nil;
     202           3 :                 return MAL_SUCCEED;
     203             :         }
     204             : 
     205           9 :         xlen = UTF8_strlen(x);
     206           9 :         ylen = UTF8_strlen(y);
     207             : 
     208           9 :         if (xlen == ylen && (strcmp(x, y) == 0))
     209             :                 return MAL_SUCCEED;
     210             : 
     211           8 :         column = GDKmalloc((xlen + 1) * sizeof(unsigned int));
     212           8 :         if (column == NULL)
     213           0 :                 throw(MAL, "levenshtein", MAL_MALLOC_FAIL);
     214             : 
     215          59 :         for (i = 1; i <= xlen; i++)
     216          51 :                 column[i] = i;
     217             : 
     218          57 :         for (j = 1; j <= ylen; j++) {
     219          49 :                 column[0] = j;
     220          49 :                 min = INT_MAX;
     221          49 :                 x_iter = x;
     222          49 :                 UTF8_GETCHAR(cy, y);
     223         306 :                 for (i = 1, last_diagonal = j - 1; i <= xlen; i++) {
     224         257 :                         UTF8_GETCHAR(cx, x_iter);
     225         257 :                         old_diagonal = column[i];
     226         257 :                         column[i] = MIN3(column[i] + insdel_cost,
     227             :                                                          column[i - 1] + insdel_cost,
     228             :                                                          last_diagonal + (cx == cy ? 0 : replace_cost));
     229         257 :                         last_diagonal = old_diagonal;
     230         257 :                         if (last_diagonal < min)
     231             :                                 min = last_diagonal;
     232             :                 }
     233          49 :                 if (max != -1 && min > (unsigned int) max) {
     234           0 :                         *res = INT_MAX;
     235           0 :                         GDKfree(column);
     236           0 :                         return MAL_SUCCEED;
     237             :                 }
     238          49 :                 (*x)++;
     239             :         }
     240             : 
     241           8 :         *res = column[xlen];
     242           8 :         GDKfree(column);
     243           8 :         return MAL_SUCCEED;
     244           0 :   illegal:
     245             :         /* UTF8_GETCHAR bail */
     246           0 :         GDKfree(column);
     247           0 :         throw(MAL, "txtsim.levenshtein", "Illegal unicode code point");
     248             : }
     249             : 
     250             : /* Levenshtein OP but with column externaly allocated */
     251             : static inline int
     252           0 : levenshtein2(const str X, const str Y, const size_t xlen, const size_t ylen,
     253             :                          unsigned int *column, const int insdel_cost,
     254             :                          const int replace_cost, const int max)
     255             : {
     256           0 :         str x = X, y = Y;
     257           0 :         str x_iter = x;
     258           0 :         unsigned int i = 0, j = 0, min;;
     259           0 :         unsigned int last_diagonal, old_diagonal;
     260           0 :         int cx, cy;
     261             : 
     262           0 :         if (strNil(x) || strNil(y))
     263           0 :                 return int_nil;
     264             : 
     265           0 :         if (xlen == ylen && (strcmp(x, y) == 0))
     266             :                 return 0;
     267             : 
     268           0 :         for (i = 1; i <= xlen; i++)
     269           0 :                 column[i] = i;
     270             : 
     271           0 :         for (j = 1; j <= ylen; j++) {
     272           0 :                 column[0] = j;
     273           0 :                 min = INT_MAX;
     274           0 :                 x_iter = x;
     275           0 :                 UTF8_GETCHAR(cy, y);
     276           0 :                 for (i = 1, last_diagonal = j - 1; i <= xlen; i++) {
     277           0 :                         UTF8_GETCHAR(cx, x_iter);
     278           0 :                         old_diagonal = column[i];
     279           0 :                         column[i] = MIN3(column[i] + insdel_cost,
     280             :                                                          column[i - 1] + insdel_cost,
     281             :                                                          last_diagonal + (cx == cy ? 0 : replace_cost));
     282           0 :                         last_diagonal = old_diagonal;
     283           0 :                         if (last_diagonal < min)
     284             :                                 min = last_diagonal;
     285             :                 }
     286           0 :                 if (max != -1 && min > (unsigned int) max)
     287             :                         return INT_MAX;
     288           0 :                 (*x)++;
     289             :         }
     290           0 :         return column[xlen];
     291             :   illegal:
     292             :         /* UTF8_GETCHAR bail */
     293             :         return INT_MAX;
     294             : }
     295             : 
     296             : static str
     297          18 : TXTSIMlevenshtein(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
     298             : {
     299          18 :         (void) cntxt;
     300          18 :         (void) mb;
     301          18 :         int *res = getArgReference_int(stk, pci, 0);
     302          18 :         str *X = getArgReference_str(stk, pci, 1),
     303          18 :                 *Y = getArgReference_str(stk, pci, 2);
     304          18 :         int insdel_cost, replace_cost;
     305             : 
     306          18 :         if (pci->argc == 3) {
     307             :                 insdel_cost = 1;
     308             :                 replace_cost = 1;
     309           7 :         } else if (pci->argc == 5 || pci->argc == 6) {
     310           7 :                 insdel_cost = *getArgReference_int(stk, pci, 3);
     311           7 :                 replace_cost = *getArgReference_int(stk, pci, 4);
     312             :                 /* Backwards compatibility purposes */
     313           7 :                 if (pci->argc == 6) {
     314           6 :                         int transposition_cost = *getArgReference_int(stk, pci, 5);
     315           6 :                         return dameraulevenshtein(res, X, Y, insdel_cost, replace_cost,
     316             :                                                                           transposition_cost);
     317             :                 }
     318             :         } else {
     319           0 :                 throw(MAL, "txtsim.levenshtein", RUNTIME_SIGNATURE_MISSING);
     320             :         }
     321             : 
     322          12 :         return levenshtein(res, X, Y, insdel_cost, replace_cost, -1);;
     323             : }
     324             : 
     325             : static str
     326           0 : TXTSIMmaxlevenshtein(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
     327             : {
     328           0 :         (void) cntxt;
     329           0 :         (void) mb;
     330           0 :         int *res = getArgReference_int(stk, pci, 0);
     331           0 :         const str *X = getArgReference_str(stk, pci, 1),
     332           0 :                 *Y = getArgReference_str(stk, pci, 2);
     333           0 :         const int *k = getArgReference_int(stk, pci, 3);
     334           0 :         int insdel_cost, replace_cost;
     335             : 
     336           0 :         if (pci->argc == 4) {
     337             :                 insdel_cost = 1;
     338             :                 replace_cost = 1;
     339           0 :         } else if (pci->argc == 6) {
     340           0 :                 insdel_cost = *getArgReference_int(stk, pci, 4);
     341           0 :                 replace_cost = *getArgReference_int(stk, pci, 5);
     342             :         } else {
     343           0 :                 throw(MAL, "txtsim.maxlevenshtein", RUNTIME_SIGNATURE_MISSING);
     344             :         }
     345             : 
     346           0 :         return levenshtein(res, X, Y, insdel_cost, replace_cost, *k);
     347             : }
     348             : 
     349             : static str
     350           0 : BATTXTSIMmaxlevenshtein(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
     351             : {
     352           0 :         (void) cntxt;
     353           0 :         (void) mb;
     354           0 :         bat *res = getArgReference_bat(stk, pci, 0);
     355           0 :         bat *lid = getArgReference_bat(stk, pci, 1);
     356           0 :         bat *rid = getArgReference_bat(stk, pci, 2);
     357           0 :         int *k = getArgReference_int(stk, pci, 3);
     358           0 :         int insdel_cost = pci->argc == 6 ? *getArgReference_int(stk, pci, 4) : 1,
     359           0 :                         replace_cost = pci->argc == 6 ? *getArgReference_int(stk, pci, 5) : 1;
     360           0 :         BAT *left = NULL, *right = NULL, *bn = NULL;
     361           0 :         BUN p, q;
     362           0 :         BATiter li, ri;
     363           0 :         unsigned int *buffer = NULL;
     364           0 :         str lv, rv, msg = MAL_SUCCEED;
     365           0 :         size_t llen = 0, rlen = 0, maxlen = 0;
     366           0 :         int d;
     367           0 :         bit v;
     368             : 
     369           0 :         if ((left = BATdescriptor(*lid)) == NULL) {
     370           0 :                 msg = createException(MAL, "battxtsim.maxlevenshtein",
     371             :                                                           SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
     372           0 :                 goto exit;
     373             :         }
     374           0 :         if ((right = BATdescriptor(*rid)) == NULL) {
     375           0 :                 msg = createException(MAL, "battxtsim.maxlevenshtein",
     376             :                                                           SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
     377           0 :                 goto exit;
     378             :         }
     379           0 :         if (BATcount(left) != BATcount(right)) {
     380           0 :                 msg = createException(MAL, "battxtsim.maxlevenshtein",
     381             :                                                           "Columns must be aligned");
     382           0 :                 goto exit;
     383             :         }
     384           0 :         if ((bn = COLnew(0, TYPE_bit, BATcount(left), TRANSIENT)) == NULL) {
     385           0 :                 msg = createException(MAL, "battxtsim.maxlevenshtein",
     386             :                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     387           0 :                 goto exit;
     388             :         }
     389             : 
     390           0 :         li = bat_iterator(left);
     391           0 :         ri = bat_iterator(right);
     392           0 :         BATloop(left, p, q) {
     393           0 :                 lv = (str) BUNtail(li, p);
     394           0 :                 rv = (str) BUNtail(ri, p);
     395           0 :                 llen = UTF8_strlen(lv);
     396           0 :                 rlen = UTF8_strlen(rv);
     397           0 :                 if (abs((int) llen - (int) rlen) > (int) *k)
     398           0 :                         v = false;
     399             :                 else {
     400           0 :                         if (llen > maxlen) {
     401           0 :                                 maxlen = llen;
     402           0 :                                 unsigned int *tmp = GDKrealloc(buffer, (maxlen + 1) * sizeof(unsigned int));
     403           0 :                                 if (tmp == NULL) {
     404           0 :                                         bat_iterator_end(&li);
     405           0 :                                         bat_iterator_end(&ri);
     406           0 :                                         msg = createException(MAL, "battxtsim.maxlevenshtein",
     407             :                                                                                   SQLSTATE(HY013) MAL_MALLOC_FAIL);
     408           0 :                                         goto exit;
     409             :                                 }
     410             :                                 buffer = tmp;
     411             :                         }
     412           0 :                         d = levenshtein2(lv, rv, llen, rlen, buffer, insdel_cost,
     413             :                                                          replace_cost, (int) *k);
     414           0 :                         v = (bit) (d <= (int) *k);
     415             :                 }
     416           0 :                 if (BUNappend(bn, (const void *) &v, false) != GDK_SUCCEED) {
     417           0 :                         bat_iterator_end(&li);
     418           0 :                         bat_iterator_end(&ri);
     419           0 :                         msg = createException(MAL, "battxtsim.maxlevenshtein",
     420             :                                                                   "BUNappend failed");
     421           0 :                         goto exit;
     422             :                 }
     423             :         }
     424           0 :         bat_iterator_end(&li);
     425           0 :         bat_iterator_end(&ri);
     426             : 
     427           0 :         *res = bn->batCacheid;
     428           0 :         BBPkeepref(bn);
     429           0 :   exit:
     430           0 :         GDKfree(buffer);
     431           0 :         BBPreclaim(left);
     432           0 :         BBPreclaim(right);
     433           0 :         if (msg != MAL_SUCCEED)
     434           0 :                 BBPreclaim(bn);
     435           0 :         return msg;
     436             : }
     437             : 
     438             : #define JARO_WINKLER_SCALING_FACTOR 0.1
     439             : #define JARO_WINKLER_PREFIX_LEN 4
     440             : 
     441             : typedef struct {
     442             :         size_t matches;                         /* accumulator for number of matches for this item */
     443             :         BUN o;                                          /* position in the BAT */
     444             :         str val;                                        /* string value */
     445             :         int *cp_sequence;                       /* string as array of Unicode codepoints */
     446             :         int len;                                        /* string length in characters (multi-byte characters count as 1) */
     447             :         int cp_seq_len;                         /* string length in bytes */
     448             :         uint64_t abm;                           /* 64bit alphabet bitmap */
     449             :         int abm_popcount;                       /* hamming weight of abm */
     450             : } str_item;
     451             : 
     452             : static inline int
     453          95 : popcount64(uint64_t x)
     454             : {
     455             : #if defined(__GNUC__)
     456          95 :         return (int) __builtin_popcountll(x);
     457             : #elif defined(_MSC_VER)
     458             : #if SIZEOF_OID == 4
     459             :         /* no __popcnt64 on 32 bit Windows */
     460             :         return (int) (__popcnt((uint32_t) x) + __popcnt((uint32_t) (x >> 32)));
     461             : #else
     462             :         return (int) __popcnt64(x);
     463             : #endif
     464             : #else
     465             :         x = (x & UINT64_C(0x5555555555555555)) + ((x >> 1) & UINT64_C(0x5555555555555555));
     466             :         x = (x & UINT64_C(0x3333333333333333)) + ((x >> 2) & UINT64_C(0x3333333333333333));
     467             :         x = (x & UINT64_C(0x0F0F0F0F0F0F0F0F)) + ((x >> 4) & UINT64_C(0x0F0F0F0F0F0F0F0F));
     468             :         return (int) ((x * UINT64_C(0x0101010101010101)) >> 56);
     469             : #endif
     470             : }
     471             : 
     472             : static int
     473          39 : str_item_lenrev_cmp(const void *a, const void *b)
     474             : {
     475          39 :         return ((int) ((str_item *) b)->len - (int) ((str_item *) a)->len);
     476             : }
     477             : 
     478             : static str
     479          61 : str_2_codepointseq(str_item *s)
     480             : {
     481          61 :         str p = s->val;
     482          61 :         int c;
     483             : 
     484          61 :         s->cp_sequence = GDKmalloc(s->len * sizeof(int));
     485          63 :         if (s->cp_sequence == NULL)
     486           0 :                 throw(MAL, "str_2_byteseq", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     487             : 
     488         423 :         for (int i = 0; i < s->len; i++) {
     489         360 :                 UTF8_GETCHAR(c, p);
     490         360 :                 if (c == 0)
     491             :                         break;
     492         360 :                 s->cp_sequence[i] = c;
     493             :         }
     494             :         return MAL_SUCCEED;
     495           0 :   illegal:
     496           0 :         throw(MAL, "str_2_byteseq", SQLSTATE(42000) "Illegal unicode code point");
     497             : }
     498             : 
     499             : static void
     500          60 : str_alphabet_bitmap(str_item *s)
     501             : {
     502          60 :         int i;
     503             : 
     504          60 :         s->abm = UINT64_C(0);
     505             : 
     506         400 :         for (i = 0; i < s->len; i++)
     507         340 :                 s->abm |= UINT64_C(1) << (s->cp_sequence[i] % 64);
     508             : 
     509          60 :         s->abm_popcount = popcount64(s->abm);
     510          60 : }
     511             : 
     512             : static inline double
     513          13 : jarowinkler_lp(const str_item *a, const str_item *b)
     514             : {
     515          13 :         unsigned int l;
     516             : 
     517             :         /* calculate common string prefix up to prefixlen chars */
     518          13 :         l = 0;
     519          65 :         for (int i = 0; i < MIN3(a->len, b->len, JARO_WINKLER_PREFIX_LEN); i++)
     520          52 :                 l += (a->cp_sequence[i] == b->cp_sequence[i]);
     521             : 
     522          13 :         return (double) l *JARO_WINKLER_SCALING_FACTOR;
     523             : }
     524             : 
     525             : static inline double
     526          15 : jarowinkler(const str_item *x, const str_item *y, double lp, int *x_flags,
     527             :                         int *y_flags)
     528             : {
     529          15 :         int xlen = x->len, ylen = y->len;
     530          15 :         int range = MAX(0, MAX(xlen, ylen) / 2 - 1);
     531          15 :         int *x1 = x->cp_sequence, *x2 = y->cp_sequence;
     532          15 :         int m = 0, t = 0;
     533          15 :         int i, j, l;
     534          15 :         double dw;
     535             : 
     536          15 :         if (!xlen || !ylen)
     537             :                 return 0.0;
     538          89 :         for (i = 0; i < xlen; i++)
     539          74 :                 x_flags[i] = 0;
     540         102 :         for (i = 0; i < ylen; i++)
     541          87 :                 y_flags[i] = 0;
     542             :         /* matching chars */
     543         102 :         for (i = 0; i < ylen; i++) {
     544         280 :                 for (j = MAX(i - range, 0), l = MIN(i + range + 1, xlen); j < l; j++) {
     545         225 :                         if (x2[i] == x1[j] && !x_flags[j]) {
     546          32 :                                 x_flags[j] = 1;
     547          32 :                                 y_flags[i] = 1;
     548          32 :                                 m++;
     549          32 :                                 break;
     550             :                         }
     551             :                 }
     552             :         }
     553          15 :         if (!m)
     554             :                 return 0.0;
     555             :         /* char transpositions */
     556             :         l = 0;
     557          90 :         for (i = 0; i < ylen; i++) {
     558          77 :                 if (y_flags[i] == 1) {
     559          48 :                         for (j = l; j < xlen; j++) {
     560          48 :                                 if (x_flags[j] == 1) {
     561          32 :                                         l = j + 1;
     562          32 :                                         break;
     563             :                                 }
     564             :                         }
     565          32 :                         if (x2[i] != x1[j])
     566           4 :                                 t++;
     567             :                 }
     568             :         }
     569          13 :         t /= 2;
     570             : 
     571             :         /* jaro similarity */
     572          13 :         dw = (((double) m / xlen) + ((double) m / ylen) +
     573          13 :                   ((double) (m - t) / m)) / 3.0;
     574             :         /* calculate common string prefix up to prefixlen chars */
     575          13 :         if (lp == -1)
     576          13 :                 lp = jarowinkler_lp(x, y);
     577             :         /* jaro-winkler similarity */
     578          13 :         dw = dw + (lp * (1 - dw));
     579          13 :         return dw;
     580             : }
     581             : 
     582             : static str
     583           2 : TXTSIMjarowinkler(dbl *res, str *x, str *y)
     584             : {
     585           2 :         int *x_flags = NULL, *y_flags = NULL;
     586           2 :         str_item xi = { 0 }, yi = { 0 };
     587           2 :         str msg = MAL_SUCCEED;
     588             : 
     589           2 :         xi.val = *x;
     590           2 :         xi.len = UTF8_strlen(*x);
     591           2 :         if ((msg = str_2_codepointseq(&xi)) != MAL_SUCCEED)
     592           0 :                 goto bailout;
     593             : 
     594           2 :         yi.val = *y;
     595           2 :         yi.len = UTF8_strlen(*y);
     596           2 :         if ((msg = str_2_codepointseq(&yi)) != MAL_SUCCEED)
     597           0 :                 goto bailout;
     598             : 
     599           2 :         x_flags = GDKmalloc(xi.len * sizeof(int));
     600           2 :         y_flags = GDKmalloc(yi.len * sizeof(int));
     601             : 
     602           2 :         if (x_flags == NULL || y_flags == NULL)
     603           0 :                 goto bailout;
     604             : 
     605           2 :         *res = jarowinkler(&xi, &yi, -1, x_flags, y_flags);
     606             : 
     607           2 :   bailout:
     608           2 :         GDKfree(x_flags);
     609           2 :         GDKfree(y_flags);
     610           2 :         GDKfree(xi.cp_sequence);
     611           2 :         GDKfree(yi.cp_sequence);
     612           2 :         return msg;
     613             : }
     614             : 
     615             : static str
     616           0 : TXTSIMminjarowinkler(bit *res, str *x, str *y, const dbl *threshold)
     617             : {
     618           0 :         str msg = MAL_SUCCEED;
     619           0 :         double s = 1;
     620             : 
     621           0 :         msg = TXTSIMjarowinkler(&s, x, y);
     622           0 :         if (msg != MAL_SUCCEED)
     623           0 :                 throw(MAL, "txt.minjarowinkler", OPERATION_FAILED);
     624             : 
     625           0 :         *res = (s > *threshold);
     626           0 :         return MAL_SUCCEED;
     627             : }
     628             : 
     629             : #define VALUE(s, x) (s##vars + VarHeapVal(s##vals, (x), s##width))
     630             : #define APPEND(b, o) (((oid *) b->theap->base)[b->batCount++] = (o))
     631             : 
     632             : #define PREP_BAT_STRITEM(B, CI, SI)                                                                             \
     633             :                 do {                                                                                                                    \
     634             :                         for (n = 0; n < CI.ncand; n++) {                                                     \
     635             :                                 SI[n].matches = 0;                                                                              \
     636             :                                 SI[n].o = canditer_next(&CI);                                                       \
     637             :                                 SI[n].val = (str) VALUE(B, SI[n].o - B->hseqbase);           \
     638             :                                 SI[n].cp_sequence = NULL;                                                               \
     639             :                                 SI[n].len = UTF8_strlen(SI[n].val);                                             \
     640             :                                 SI[n].cp_seq_len = str_strlen(SI[n].val);                               \
     641             :                                 if ((msg = str_2_codepointseq(&SI[n])) != MAL_SUCCEED)      \
     642             :                                         goto exit;                                                                                      \
     643             :                                 str_alphabet_bitmap(&SI[n]);                                                        \
     644             :                         }                                                                                                                       \
     645             :                 } while (false)
     646             : 
     647             : #define FINALIZE_BATS(L, R, LCI, RCI, LSI, RSI) \
     648             :                 do {                                                                    \
     649             :                         assert(BATcount(L) == BATcount(R));     \
     650             :                         BATsetcount(L, BATcount(L));            \
     651             :                         BATsetcount(R, BATcount(R));            \
     652             :                         for (n = 0; n < LCI.ncand; n++) {    \
     653             :                                 if (LSI[n].matches > 1) {            \
     654             :                                         L->tkey = false;                     \
     655             :                                         break;                                          \
     656             :                                 }                                                               \
     657             :                         }                                                                       \
     658             :                         if (n == LCI.ncand) {                           \
     659             :                                 L->tkey = true;                                      \
     660             :                         }                                                                       \
     661             :                         for (n = 0; n < RCI.ncand; n++) {    \
     662             :                                 if (RSI[n].matches > 1) {            \
     663             :                                         R->tkey = false;                     \
     664             :                                         break;                                          \
     665             :                                 }                                                               \
     666             :                         }                                                                       \
     667             :                         if (n == RCI.ncand) {                           \
     668             :                                 R->tkey = true;                                      \
     669             :                         }                                                                       \
     670             :                         BATordered(L);                                          \
     671             :                         BATordered(R);                                          \
     672             :                         L->theap->dirty |= BATcount(L) > 0;    \
     673             :                         R->theap->dirty |= BATcount(R) > 0;    \
     674             :                 } while (false)
     675             : 
     676             : static inline int
     677          12 : maxlevenshtein_extcol_stritem(const str_item *si1, const str_item *si2,
     678             :                                                           unsigned int *column, const int k)
     679             : {
     680          12 :         unsigned int lastdiag, olddiag;
     681          12 :         int c1, c2, x, y;
     682          12 :         unsigned int min;
     683          12 :         int s1len = si1->len, s2len = si2->len;
     684          12 :         int *s1 = si1->cp_sequence, *s2 = si2->cp_sequence;
     685             :         /* first test if the strings are equal */
     686          12 :         if (s1len == s2len) {
     687           1 :                 for (x = 0; x < s1len; x++)
     688           1 :                         if (s1[x] != s2[x])
     689             :                                 break;
     690           1 :                 if (x == s1len)
     691             :                         return 0;
     692             :         }
     693          68 :         for (y = 1; y <= s1len; y++)
     694          56 :                 column[y] = y;
     695          78 :         for (x = 1; x <= s2len; x++) {
     696          69 :                 c2 = s2[x - 1];
     697          69 :                 column[0] = x;
     698          69 :                 min = INT_MAX;
     699         395 :                 for (y = 1, lastdiag = x - 1; y <= s1len; y++) {
     700         326 :                         olddiag = column[y];
     701         326 :                         c1 = s1[y - 1];
     702         326 :                         column[y] = MIN3(column[y] + 1, column[y - 1] + 1, lastdiag + (c1 == c2 ? 0 : 1));
     703         326 :                         lastdiag = olddiag;
     704         326 :                         if (lastdiag < min)
     705             :                                 min = lastdiag;
     706             :                 }
     707          69 :                 if (min > (unsigned int) k)
     708             :                         return INT_MAX;
     709             :         }
     710           9 :         return column[s1len];
     711             : }
     712             : 
     713             : static str
     714           6 : maxlevenshteinjoin(BAT **r1, BAT **r2, BAT *l, BAT *r, BAT *sl, BAT *sr, int k)
     715             : {
     716           6 :         BAT *r1t = NULL, *r2t = NULL;
     717           6 :         BUN n;
     718           6 :         struct canditer lci, rci;
     719           6 :         const char *lvals, *rvals, *lvars, *rvars;;
     720           6 :         int lwidth, rwidth, d;
     721           6 :         str_item *lsi = NULL, *rsi = NULL;
     722           6 :         str msg = MAL_SUCCEED;
     723           6 :         unsigned int *buffer = NULL;
     724             : 
     725          18 :         assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));
     726           6 :         assert(ATOMtype(l->ttype) == TYPE_str);
     727             : 
     728           6 :         canditer_init(&lci, l, sl);
     729           6 :         canditer_init(&rci, r, sr);
     730             : 
     731           6 :         if (lci.ncand == 0 || rci.ncand == 0)
     732           0 :                 goto exit;
     733             : 
     734           6 :         lvals = (const char *) Tloc(l, 0);
     735           6 :         rvals = (const char *) Tloc(r, 0);
     736           6 :         lvars = l->tvheap->base;
     737           6 :         rvars = r->tvheap->base;
     738           6 :         lwidth = l->twidth;
     739           6 :         rwidth = r->twidth;
     740             : 
     741           6 :         if ((r1t = COLnew(0, TYPE_oid, lci.ncand, TRANSIENT)) == NULL ||
     742           6 :                 (r2t = COLnew(0, TYPE_oid, rci.ncand, TRANSIENT)) == NULL) {
     743           0 :                 reclaim_bats(2, r1t, r2t);
     744           0 :                 msg = createException(MAL, "txtsim.maxlevenshteinjoin",
     745             :                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     746           0 :                 goto exit;
     747             :         }
     748             : 
     749           6 :         r1t->tsorted = r1t->trevsorted = false;
     750           6 :         r2t->tsorted = r2t->trevsorted = false;
     751             : 
     752           6 :         if ((lsi = GDKmalloc(lci.ncand * sizeof(str_item))) == NULL ||
     753           6 :                 (rsi = GDKmalloc(rci.ncand * sizeof(str_item))) == NULL) {
     754           0 :                 reclaim_bats(2, r1t, r2t);
     755           0 :                 msg = createException(MAL, "txtsim.maxlevenshteinjoin",
     756             :                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     757           0 :                 goto exit;
     758             :         }
     759             : 
     760          12 :         PREP_BAT_STRITEM(l, lci, lsi);
     761          24 :         PREP_BAT_STRITEM(r, rci, rsi);
     762           6 :         qsort(lsi, lci.ncand, sizeof(str_item), str_item_lenrev_cmp);
     763           6 :         qsort(rsi, rci.ncand, sizeof(str_item), str_item_lenrev_cmp);
     764             : 
     765           6 :         if ((buffer = GDKmalloc((lsi[0].len + 1) * sizeof(int))) == NULL) {
     766           0 :                 msg = createException(MAL, "txtsim.maxlevenshteinjoin",
     767             :                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     768           0 :                 goto exit;
     769             :         }
     770             : 
     771             :         /* join loop */
     772          24 :         for (BUN lstart = 0, rstart = 0; rstart < rci.ncand; rstart++) {
     773          34 :                 for (n = lstart; n < lci.ncand; n++) {
     774             :                         /* first and cheapest filter */
     775          18 :                         if ((lsi[n].len) > k + rsi[rstart].len) {
     776           0 :                                 lstart++;
     777           0 :                                 continue;               /* no possible matches yet for this r */
     778          18 :                         } else if (rsi[rstart].len > k + lsi[n].len) {
     779             :                                 break;                  /* no more possible matches from this r */
     780             :                         }
     781             :                         /* filter by comparing alphabet bitmaps (imprecise but fast filter) */
     782          16 :                         d = MAX(lsi[n].abm_popcount,
     783             :                                         rsi[rstart].abm_popcount) -
     784          16 :                                         popcount64(lsi[n].abm & rsi[rstart].abm);
     785          16 :                         if (d > k)
     786           4 :                                 continue;
     787             :                         /* final and most expensive test: Levenshtein distance */
     788          12 :                         d = maxlevenshtein_extcol_stritem(&lsi[n], &rsi[rstart], buffer,
     789             :                                                                                           (const unsigned int) k);
     790          12 :                         if (d > k)
     791           6 :                                 continue;
     792             :                         /* The match test succeeded */
     793           6 :                         lsi[n].matches++;
     794           6 :                         rsi[rstart].matches++;
     795           6 :                         if (bunfastappTYPE(oid, r1t, &(lsi[n].o)) != GDK_SUCCEED) {
     796           0 :                                 reclaim_bats(2, r1t, r2t);
     797           0 :                                 msg = createException(MAL, "txtsim.maxlevenshteinjoin",
     798             :                                                                           OPERATION_FAILED "Failed bun append");
     799           0 :                                 goto exit;
     800             :                         }
     801           6 :                         if (bunfastappTYPE(oid, r2t, &(rsi[rstart].o)) != GDK_SUCCEED) {
     802           0 :                                 reclaim_bats(2, r1t, r2t);
     803           0 :                                 msg = createException(MAL, "txtsim.maxlevenshteinjoin",
     804             :                                                                           OPERATION_FAILED "Failed bun append");
     805           0 :                                 goto exit;
     806             :                         }
     807             :                 }
     808             :         }
     809             : 
     810          29 :         FINALIZE_BATS(r1t, r2t, lci, rci, lsi, rsi);
     811           6 :         *r1 = r1t;
     812           6 :         *r2 = r2t;
     813             : 
     814           6 :   exit:
     815          12 :         for (n = 0; lsi && n < lci.ncand; n++)
     816           6 :                 GDKfree(lsi[n].cp_sequence);
     817          24 :         for (n = 0; rsi && n < rci.ncand; n++)
     818          18 :                 GDKfree(rsi[n].cp_sequence);
     819           6 :         GDKfree(lsi);
     820           6 :         GDKfree(rsi);
     821           6 :         GDKfree(buffer);
     822           6 :         return msg;
     823             : }
     824             : 
     825             : static str
     826           5 : TXTSIMmaxlevenshteinjoin(bat *r1, bat *r2, const bat *lid, const bat *rid,
     827             :                                                  const bat *kid, const bat *slid, const bat *srid,
     828             :                                                  const bit *nil_matches, const lng *estimate,
     829             :                                                  const bit *anti)
     830             : {
     831           5 :         (void) nil_matches;
     832           5 :         (void) estimate;
     833           5 :         (void) anti;
     834             : 
     835           5 :         BAT *bleft = NULL, *bright = NULL, *bk = NULL,
     836           5 :                         *bcandleft = NULL, *bcandright = NULL, *r1t = NULL, *r2t = NULL;
     837           5 :         int k = 0;
     838           5 :         str msg = MAL_SUCCEED;
     839             : 
     840           5 :         if ((bleft = BATdescriptor(*lid)) == NULL ||
     841           6 :                 (bright = BATdescriptor(*rid)) == NULL ||
     842           6 :                 (bk = BATdescriptor(*kid)) == NULL) {
     843           0 :                 msg = createException(MAL, "txtsim.maxlevenshteinjoin",
     844             :                                                           SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
     845           0 :                 goto exit;
     846             :         }
     847             : 
     848           6 :         if ((*slid != bat_nil && (bcandleft = BATdescriptor(*slid)) == NULL) ||
     849           6 :                 (*srid != bat_nil && (bcandright = BATdescriptor(*srid)) == NULL)) {
     850           0 :                 msg = createException(MAL, "txtsim.maxlevenshteinjoin",
     851             :                                                           SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
     852           0 :                 goto exit;
     853             :         }
     854             : 
     855           6 :         if (BATcount(bk) > 0) {
     856           6 :                 BATiter ki = bat_iterator(bk);
     857           6 :                 k = *(int *) BUNtloc(ki, 0);
     858           6 :                 bat_iterator_end(&ki);
     859             :         }
     860             : 
     861           6 :         if ((msg = maxlevenshteinjoin(&r1t, &r2t, bleft, bright, bcandleft, bcandright, k)) != MAL_SUCCEED)
     862           0 :                 goto exit;
     863             : 
     864           6 :         *r1 = r1t->batCacheid;
     865           6 :         *r2 = r2t->batCacheid;
     866           6 :         BBPkeepref(r1t);
     867           6 :         BBPkeepref(r2t);
     868             : 
     869           6 :   exit:
     870           6 :         reclaim_bats(5, bleft, bright, bcandleft, bcandright, bk);
     871           6 :         if (msg != MAL_SUCCEED)
     872           0 :                 reclaim_bats(2, r1t, r2t);
     873           6 :         return msg;
     874             : }
     875             : 
     876             : static inline void
     877           7 : jarowinkler_rangebounds(int *lb, int *ub, const str_item *a, const double lp,
     878             :                                                 const double threshold)
     879             : {
     880           7 :         *lb = (int) floor(3.0 * a->len * (threshold - lp) / (1.0 - lp) -
     881           7 :                                           (2.0 * a->len));
     882           7 :         *ub = (int) ceil(a->len / ((3.0 * (threshold - lp) / (1.0 - lp)) - 2.0));
     883           7 : }
     884             : 
     885             : /* version with given lp and m, and t = 0*/
     886             : static inline double
     887          19 : jarowinkler_lp_m_t0(const str_item *lsi, const str_item *rsi, double lp, int m)
     888             : {
     889          19 :         double dw;
     890             :         /* Jaro similarity */
     891          19 :         dw = (((double) m / lsi->len) + ((double) m / rsi->len) + 1.0) / 3.0;
     892             :         /* Jaro-Winkler similarity */
     893          19 :         dw = dw + (lp * (1 - dw));
     894          19 :         return dw;
     895             : }
     896             : 
     897             : static str
     898           9 : minjarowinklerjoin(BAT **r1, BAT **r2, BAT *l, BAT *r, BAT *sl, BAT *sr,
     899             :                                    const dbl threshold)
     900             : {
     901           9 :         BAT *r1t = NULL, *r2t = NULL;
     902           9 :         BUN n;
     903           9 :         struct canditer lci, rci;
     904           9 :         const char *lvals, *rvals, *lvars, *rvars;
     905           9 :         int lwidth, rwidth, lb = 0, ub = 0, m = -1, *x_flags = NULL, *y_flags = NULL;
     906           9 :         str_item *ssl = NULL, *ssr = NULL, shortest;
     907           9 :         str msg = MAL_SUCCEED;
     908           9 :         const bool sliding_window_allowed = threshold > (2.01 + JARO_WINKLER_PREFIX_LEN * JARO_WINKLER_SCALING_FACTOR) / 3.0;
     909           9 :         double s, lp = 0;
     910             : 
     911          27 :         assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));
     912           9 :         assert(ATOMtype(l->ttype) == TYPE_str);
     913             : 
     914           9 :         canditer_init(&lci, l, sl);
     915           9 :         canditer_init(&rci, r, sr);
     916             : 
     917           9 :         if (lci.ncand == 0 || rci.ncand == 0)
     918           0 :                 goto exit;
     919             : 
     920           9 :         lvals = (const char *) Tloc(l, 0);
     921           9 :         rvals = (const char *) Tloc(r, 0);
     922           9 :         assert(r->ttype);
     923           9 :         lvars = l->tvheap->base;
     924           9 :         rvars = r->tvheap->base;
     925           9 :         lwidth = l->twidth;
     926           9 :         rwidth = r->twidth;
     927             : 
     928           9 :         if ((r1t = COLnew(0, TYPE_oid, lci.ncand, TRANSIENT)) == NULL ||
     929           9 :                 (r2t = COLnew(0, TYPE_oid, rci.ncand, TRANSIENT)) == NULL) {
     930           0 :                 reclaim_bats(2, r1t, r2t);
     931           0 :                 msg = createException(MAL, "txtsim.minjarowinklerjoin",
     932             :                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     933           0 :                 goto exit;
     934             :         }
     935             : 
     936           9 :         r1t->tsorted = r1t->trevsorted = false;
     937           9 :         r2t->tsorted = r2t->trevsorted = false;
     938             : 
     939           9 :         if ((ssl = GDKmalloc(lci.ncand * sizeof(str_item))) == NULL ||
     940           9 :                 (ssr = GDKmalloc(rci.ncand * sizeof(str_item))) == NULL) {
     941           0 :                 reclaim_bats(2, r1t, r2t);
     942           0 :                 msg = createException(MAL, "txtsim.maxlevenshteinjoin",
     943             :                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     944           0 :                 goto exit;
     945             :         }
     946             : 
     947          16 :         PREP_BAT_STRITEM(l, lci, ssl);
     948          32 :         PREP_BAT_STRITEM(r, rci, ssr);
     949           9 :         qsort(ssl, lci.ncand, sizeof(str_item), str_item_lenrev_cmp);
     950           7 :         qsort(ssr, rci.ncand, sizeof(str_item), str_item_lenrev_cmp);
     951             : 
     952           8 :         if ((x_flags = GDKmalloc(ssl[0].len * sizeof(int))) == NULL ||
     953           9 :                 (y_flags = GDKmalloc(ssr[0].len * sizeof(int))) == NULL) {
     954           0 :                 msg = createException(MAL, "txtsim.minjarowinklerjoin",
     955             :                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     956           0 :                 goto exit;
     957             :         }
     958             :         // lp used for filters. Use -1 for actual JW (forces to compute actual lp)
     959          32 :         lp = JARO_WINKLER_PREFIX_LEN * JARO_WINKLER_SCALING_FACTOR;
     960             : 
     961             :         /* join loop */
     962          32 :         for (BUN lstart = 0, rstart = 0; rstart < rci.ncand; rstart++) {
     963          23 :                 if (sliding_window_allowed)
     964           7 :                         jarowinkler_rangebounds(&lb, &ub, &ssr[rstart], lp, threshold);
     965          44 :                 for (n = lstart; n < lci.ncand; n++) {
     966             :                         /* Update sliding window */
     967             :                         /* This is the first and cheapest filter */
     968          25 :                         if (sliding_window_allowed) {
     969           7 :                                 if (ssl[n].len > ub) {       /* no possible matches yet for this r */
     970           1 :                                         lstart++;
     971           1 :                                         continue;
     972             :                                 }
     973           6 :                                 if (ssl[n].len < lb) {       /* no more possible matches from this r */
     974             :                                         break;
     975             :                                 }
     976             :                         }
     977             :                         /* filter by comparing alphabet bitmaps */
     978             :                         /* find the best possible m: the length of the shorter string
     979             :                          * minus the number of characters that surely cannot match */
     980          19 :                         shortest = ssl[n].len < ssr[rstart].len ? ssl[n] : ssr[rstart];
     981          19 :                         m = shortest.len - popcount64(shortest.abm -
     982          19 :                                                                                   (ssl[n].abm & ssr[rstart].abm));
     983             :                         /* equivalent to:
     984             :                            m = shortest.len - popcount64(shortest.abm & (~(ssl[n].abm & ssr[rstart].abm))); */
     985          19 :                         s = jarowinkler_lp_m_t0(&ssl[n], &ssr[rstart], lp, m);
     986          19 :                         if (s < threshold) {
     987           7 :                                 continue;
     988             :                         }
     989             :                         /* final and most expensive test: Jaro-Winkler similarity */
     990          12 :                         s = jarowinkler(&ssl[n], &ssr[rstart], -1, x_flags, y_flags);
     991          13 :                         if (s < threshold) {
     992           4 :                                 continue;
     993             :                         }
     994             :                         /* The match test succeeded */
     995           9 :                         ssl[n].matches++;
     996           9 :                         ssr[rstart].matches++;
     997           9 :                         if (bunfastappTYPE(oid, r1t, &(ssl[n].o)) != GDK_SUCCEED) {
     998           0 :                                 reclaim_bats(2, r1t, r2t);
     999           0 :                                 msg = createException(MAL, "txtsim.maxlevenshteinjoin",
    1000             :                                                                           OPERATION_FAILED "Failed bun append");
    1001           0 :                                 goto exit;
    1002             :                         }
    1003           9 :                         if (bunfastappTYPE(oid, r2t, &(ssr[rstart].o)) != GDK_SUCCEED) {
    1004           0 :                                 reclaim_bats(2, r1t, r2t);
    1005           0 :                                 msg = createException(MAL, "txtsim.maxlevenshteinjoin",
    1006             :                                                                           OPERATION_FAILED "Failed bun append");
    1007           0 :                                 goto exit;
    1008             :                         }
    1009             :                 }
    1010             :         }
    1011             : 
    1012          42 :         FINALIZE_BATS(r1t, r2t, lci, rci, ssl, ssr);
    1013           9 :         *r1 = r1t;
    1014           9 :         *r2 = r2t;
    1015             : 
    1016           9 :   exit:
    1017           9 :         if (ssl)
    1018          18 :                 for (n = 0; n < lci.ncand; n++)
    1019           9 :                         GDKfree(ssl[n].cp_sequence);
    1020           9 :         if (ssr)
    1021          34 :                 for (n = 0; n < rci.ncand; n++)
    1022          25 :                         GDKfree(ssr[n].cp_sequence);
    1023           9 :         GDKfree(x_flags);
    1024           9 :         GDKfree(y_flags);
    1025           9 :         GDKfree(ssl);
    1026           9 :         GDKfree(ssr);
    1027           9 :         return msg;
    1028             : }
    1029             : 
    1030             : static str
    1031           9 : TXTSIMminjarowinklerjoin(bat *r1, bat *r2, const bat *lid, const bat *rid,
    1032             :                                                  const bat *thresholdid, const bat *slid,
    1033             :                                                  const bat *srid, const bit *nil_matches,
    1034             :                                                  const lng *estimate, const bit *anti)
    1035             : {
    1036           9 :         (void) nil_matches;
    1037           9 :         (void) estimate;
    1038           9 :         (void) anti;
    1039             : 
    1040           9 :         BAT *bleft = NULL, *bright = NULL,
    1041           9 :                         *bcandleft = NULL, *bcandright = NULL, *bthreshold = NULL,
    1042           9 :                         *r1t = NULL, *r2t = NULL;
    1043           9 :         dbl threshold = 1;
    1044           9 :         str msg = MAL_SUCCEED;
    1045             : 
    1046           9 :         if ((bleft = BATdescriptor(*lid)) == NULL ||
    1047           9 :                 (bright = BATdescriptor(*rid)) == NULL ||
    1048           9 :                 (bthreshold = BATdescriptor(*thresholdid)) == NULL) {
    1049           0 :                 msg = createException(MAL, "txtsim.minjarowinklerjoin",
    1050             :                                                           SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    1051           0 :                 goto exit;
    1052             :         }
    1053             : 
    1054           9 :         if ((*slid != bat_nil && (bcandleft = BATdescriptor(*slid)) == NULL) ||
    1055           9 :                 (*srid != bat_nil && (bcandright = BATdescriptor(*srid)) == NULL)) {
    1056           0 :                 msg = createException(MAL, "txtsim.minjarowinklerjoin",
    1057             :                                                           SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    1058           0 :                 goto exit;
    1059             :         }
    1060             : 
    1061           9 :         if (BATcount(bthreshold) > 0) {
    1062           9 :                 BATiter thresholdi = bat_iterator(bthreshold);
    1063           9 :                 threshold = *(dbl *) BUNtail(thresholdi, 0);
    1064           9 :                 bat_iterator_end(&thresholdi);
    1065             :         }
    1066             : 
    1067           9 :         if ((msg = minjarowinklerjoin(&r1t, &r2t, bleft, bright, bcandleft, bcandright, threshold)) != MAL_SUCCEED)
    1068           0 :                 goto exit;
    1069             : 
    1070           9 :         *r1 = r1t->batCacheid;
    1071           9 :         *r2 = r2t->batCacheid;
    1072           9 :         BBPkeepref(r1t);
    1073           9 :         BBPkeepref(r2t);
    1074             : 
    1075           9 :   exit:
    1076           9 :         reclaim_bats(5, bleft, bright, bcandleft, bcandright, bthreshold);
    1077           9 :         if (msg != MAL_SUCCEED)
    1078           0 :                 reclaim_bats(2, r1t, r2t);
    1079           9 :         return msg;
    1080             : }
    1081             : 
    1082             : #define SoundexLen 4                    /* length of a soundex code */
    1083             : #define SoundexKey "Z000"             /* default key for soundex code */
    1084             : 
    1085             : /* set letter values */
    1086             : static const int Code[] = { 0, 1, 2, 3, 0, 1,
    1087             :         2, 0, 0, 2, 2, 4,
    1088             :         5, 5, 0, 1, 2, 6,
    1089             :         2, 3, 0, 1, 0, 2, 0, 2
    1090             : };
    1091             : 
    1092             : #define RETURN_NIL_IF(b,t)                                                                                              \
    1093             :         if (b) {                                                                                                                        \
    1094             :                 if (ATOMextern(t)) {                                                                                    \
    1095             :                         *(ptr*) res = (ptr) ATOMnil(t);                                                 \
    1096             :                         if ( *(ptr *) res == NULL)                                                                      \
    1097             :                                 throw(MAL,"txtsim", SQLSTATE(HY013) MAL_MALLOC_FAIL); \
    1098             :                 } else {                                                                                                                \
    1099             :                         memcpy(res, ATOMnilptr(t), ATOMsize(t));                                        \
    1100             :                 }                                                                                                                               \
    1101             :                 return MAL_SUCCEED;                                                                                     \
    1102             :         }
    1103             : 
    1104             : static inline char
    1105         139 : SCode(unsigned char c)
    1106             : {
    1107         139 :         if (c == 95)
    1108             :                 return (2);                             /* german sz */
    1109         139 :         return (Code[toupper(c) - 'A']);
    1110             : }
    1111             : 
    1112             : static str
    1113          66 : soundex_code(const char *Name, char *Key)
    1114             : {
    1115          66 :         char LastLetter;
    1116          66 :         int Index;
    1117             : 
    1118         298 :         for (const char *p = Name; *p; p++)
    1119         233 :                 if ((*p & 0x80) != 0)
    1120           1 :                         throw(MAL, "soundex",
    1121             :                                   SQLSTATE(42000)
    1122             :                                   "Soundex function not available for non ASCII strings");
    1123             : 
    1124             :         /* set default key */
    1125          65 :         strcpy(Key, SoundexKey);
    1126             : 
    1127             :         /* keep first letter */
    1128          65 :         Key[0] = *Name;
    1129          65 :         if (!isupper((unsigned char) (Key[0])))
    1130          23 :                 Key[0] = toupper(Key[0]);
    1131             : 
    1132          65 :         LastLetter = *Name;
    1133          65 :         if (!*Name)
    1134             :                 return MAL_SUCCEED;
    1135          55 :         Name++;
    1136             : 
    1137             :         /* scan rest of string */
    1138         208 :         for (Index = 1; (Index <SoundexLen) &&*Name; Name++) {
    1139             :                 /* use only letters */
    1140         153 :                 if (isalpha((unsigned char) (*Name))) {
    1141             :                         /* ignore duplicate successive chars */
    1142         151 :                         if (LastLetter != *Name) {
    1143             :                                 /* new LastLetter */
    1144         139 :                                 LastLetter = *Name;
    1145             : 
    1146             :                                 /* ignore letters with code 0 */
    1147         139 :                                 if (SCode(*Name) != 0) {
    1148          68 :                                         Key[Index] = '0' + SCode(*Name);
    1149          68 :                                         Index ++;
    1150             :                                 }
    1151             :                         }
    1152             :                 }
    1153             :         }
    1154             :         return MAL_SUCCEED;
    1155             : }
    1156             : 
    1157             : static str
    1158          68 : soundex(str *res, str *Name)
    1159             : {
    1160          68 :         str msg = MAL_SUCCEED;
    1161             : 
    1162          68 :         GDKfree(*res);
    1163          72 :         RETURN_NIL_IF(strNil(*Name), TYPE_str);
    1164             : 
    1165          64 :         *res = (str) GDKmalloc(sizeof(char) * (SoundexLen + 1));
    1166          66 :         if (*res == NULL)
    1167           0 :                 throw(MAL, "soundex", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1168             : 
    1169             :         /* calculate Key for Name */
    1170          66 :         if ((msg = soundex_code(*Name, *res))) {
    1171           1 :                 GDKfree(*res);
    1172           1 :                 *res = NULL;
    1173           1 :                 return msg;
    1174             :         }
    1175             :         return msg;
    1176             : }
    1177             : 
    1178             : static str
    1179          29 : stringdiff(int *res, str *s1, str *s2)
    1180             : {
    1181          29 :         str r = MAL_SUCCEED;
    1182          29 :         char *S1 = NULL, *S2 = NULL;
    1183             : 
    1184          29 :         r = soundex(&S1, s1);
    1185          28 :         if (r != MAL_SUCCEED)
    1186             :                 return r;
    1187          28 :         r = soundex(&S2, s2);
    1188          29 :         if (r != MAL_SUCCEED) {
    1189           0 :                 GDKfree(S1);
    1190           0 :                 return r;
    1191             :         }
    1192          29 :         r = TXTSIMdameraulevenshtein1(res, &S1, &S2);
    1193          30 :         GDKfree(S1);
    1194          30 :         GDKfree(S2);
    1195          30 :         return r;
    1196             : }
    1197             : 
    1198             : /******************************
    1199             :  * QGRAMNORMALIZE
    1200             :  *
    1201             :  * This function 'normalizes' a string so valid q-grams can  be made of it:
    1202             :  * All characters are transformed to uppercase, and all characters
    1203             :  * which are not letters or digits are stripped to a single space.
    1204             :  *
    1205             :  * qgramnormalize("Hallo, allemaal!").print(); --> "HALLO ALLEMAAL"
    1206             :  * qgramnormalize(" '' t ' est").print(); --> [ "T EST" ]
    1207             :  *
    1208             :  *****************************/
    1209             : static str
    1210          12 : qgram_normalize(str *res, str *Input)
    1211             : {
    1212          12 :         char *input = *Input;
    1213          12 :         int i, j = 0;
    1214          12 :         char c, last = ' ';
    1215             : 
    1216          12 :         GDKfree(*res);
    1217          12 :         RETURN_NIL_IF(strNil(input), TYPE_str);
    1218          12 :         *res = (str) GDKmalloc(sizeof(char) * (strlen(input) + 1));     /* normalized strings are never longer than original */
    1219          12 :         if (*res == NULL)
    1220           0 :                 throw(MAL, "txtsim.qgramnormalize", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1221             : 
    1222          69 :         for (i = 0; input[i]; i++) {
    1223          57 :                 c = toupper(input[i]);
    1224          57 :                 if (!(('A' <= c && c <= 'Z') || isdigit((unsigned char) c)))
    1225          57 :                         c = ' ';
    1226          57 :                 if (c != ' ' || last != ' ') {
    1227          49 :                         (*res)[j++] = c;
    1228             :                 }
    1229          57 :                 last = c;
    1230             :         }
    1231          12 :         (*res)[j] = 0;
    1232             :         /* strip final whitespace */
    1233          15 :         while (j > 0 && (*res)[--j] == ' ')
    1234           3 :                 (*res)[j] = 0;
    1235             : 
    1236             :         return MAL_SUCCEED;
    1237             : }
    1238             : 
    1239             : static str
    1240           0 : qgram_selfjoin(bat *res1, bat *res2, bat *qid, bat *bid, bat *pid, bat *lid,
    1241             :                            flt *c, int *k)
    1242             : {
    1243           0 :         BAT *qgram, *id, *pos, *len;
    1244           0 :         BUN n;
    1245           0 :         BUN i, j;
    1246           0 :         BAT *bn, *bn2;
    1247           0 :         oid *qbuf;
    1248           0 :         int *ibuf;
    1249           0 :         int *pbuf;
    1250           0 :         int *lbuf;
    1251           0 :         str msg = MAL_SUCCEED;
    1252             : 
    1253           0 :         qgram = BATdescriptor(*qid);
    1254           0 :         id = BATdescriptor(*bid);
    1255           0 :         pos = BATdescriptor(*pid);
    1256           0 :         len = BATdescriptor(*lid);
    1257           0 :         if (qgram == NULL || id == NULL || pos == NULL || len == NULL) {
    1258           0 :                 BBPreclaim(qgram);
    1259           0 :                 BBPreclaim(id);
    1260           0 :                 BBPreclaim(pos);
    1261           0 :                 BBPreclaim(len);
    1262           0 :                 throw(MAL, "txtsim.qgramselfjoin",
    1263             :                           SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    1264             :         }
    1265             : 
    1266           0 :         BATiter qgrami = bat_iterator(qgram);
    1267           0 :         BATiter idi = bat_iterator(id);
    1268           0 :         BATiter posi = bat_iterator(pos);
    1269           0 :         BATiter leni = bat_iterator(len);
    1270           0 :         if (qgrami.type != TYPE_oid)
    1271           0 :                 msg = createException(MAL, "txtsim.qgramselfjoin",
    1272             :                                                           SEMANTIC_TYPE_MISMATCH
    1273             :                                                           ": tail of BAT qgram must be oid");
    1274           0 :         else if (idi.type != TYPE_int)
    1275           0 :                 msg = createException(MAL, "txtsim.qgramselfjoin",
    1276             :                                                           SEMANTIC_TYPE_MISMATCH
    1277             :                                                           ": tail of BAT id must be int");
    1278           0 :         else if (posi.type != TYPE_int)
    1279           0 :                 msg = createException(MAL, "txtsim.qgramselfjoin",
    1280             :                                                           SEMANTIC_TYPE_MISMATCH
    1281             :                                                           ": tail of BAT pos must be int");
    1282           0 :         else if (leni.type != TYPE_int)
    1283           0 :                 msg = createException(MAL, "txtsim.qgramselfjoin",
    1284             :                                                           SEMANTIC_TYPE_MISMATCH
    1285             :                                                           ": tail of BAT len must be int");
    1286           0 :         if (msg) {
    1287           0 :                 bat_iterator_end(&qgrami);
    1288           0 :                 bat_iterator_end(&idi);
    1289           0 :                 bat_iterator_end(&posi);
    1290           0 :                 bat_iterator_end(&leni);
    1291           0 :                 BBPunfix(qgram->batCacheid);
    1292           0 :                 BBPunfix(id->batCacheid);
    1293           0 :                 BBPunfix(pos->batCacheid);
    1294           0 :                 BBPunfix(len->batCacheid);
    1295           0 :                 return msg;
    1296             :         }
    1297             : 
    1298           0 :         n = BATcount(qgram);
    1299             : 
    1300             :         /* if (BATcount(qgram)>1 && !qgrami.sorted) throw(MAL, "txtsim.qgramselfjoin", SEMANTIC_TYPE_MISMATCH); */
    1301             : 
    1302           0 :         if (!ALIGNsynced(qgram, id))
    1303           0 :                  msg = createException(MAL, "txtsim.qgramselfjoin",
    1304             :                                                            SEMANTIC_TYPE_MISMATCH
    1305             :                                                            ": qgram and id are not synced");
    1306             : 
    1307           0 :         else if (!ALIGNsynced(qgram, pos))
    1308           0 :                 msg = createException(MAL, "txtsim.qgramselfjoin",
    1309             :                                                           SEMANTIC_TYPE_MISMATCH
    1310             :                                                           ": qgram and pos are not synced");
    1311           0 :         else if (!ALIGNsynced(qgram, len))
    1312           0 :                 msg = createException(MAL, "txtsim.qgramselfjoin",
    1313             :                                                           SEMANTIC_TYPE_MISMATCH
    1314             :                                                           ": qgram and len are not synced");
    1315             : 
    1316           0 :         else if (qgrami.width != ATOMsize(qgrami.type))
    1317           0 :                 msg = createException(MAL, "txtsim.qgramselfjoin",
    1318             :                                                           SEMANTIC_TYPE_MISMATCH
    1319             :                                                           ": qgram is not a true void bat");
    1320           0 :         else if (idi.width != ATOMsize(idi.type))
    1321           0 :                 msg = createException(MAL, "txtsim.qgramselfjoin",
    1322             :                                                           SEMANTIC_TYPE_MISMATCH
    1323             :                                                           ": id is not a true void bat");
    1324             : 
    1325           0 :         else if (posi.width != ATOMsize(posi.type))
    1326           0 :                 msg = createException(MAL, "txtsim.qgramselfjoin",
    1327             :                                                           SEMANTIC_TYPE_MISMATCH
    1328             :                                                           ": pos is not a true void bat");
    1329           0 :         else if (leni.width != ATOMsize(leni.type))
    1330           0 :                 msg = createException(MAL, "txtsim.qgramselfjoin",
    1331             :                                                           SEMANTIC_TYPE_MISMATCH
    1332             :                                                           ": len is not a true void bat");
    1333           0 :         if (msg) {
    1334           0 :                 bat_iterator_end(&qgrami);
    1335           0 :                 bat_iterator_end(&idi);
    1336           0 :                 bat_iterator_end(&posi);
    1337           0 :                 bat_iterator_end(&leni);
    1338           0 :                 BBPunfix(qgram->batCacheid);
    1339           0 :                 BBPunfix(id->batCacheid);
    1340           0 :                 BBPunfix(pos->batCacheid);
    1341           0 :                 BBPunfix(len->batCacheid);
    1342           0 :                 return msg;
    1343             :         }
    1344             : 
    1345           0 :         bn = COLnew(0, TYPE_int, n, TRANSIENT);
    1346           0 :         bn2 = COLnew(0, TYPE_int, n, TRANSIENT);
    1347           0 :         if (bn == NULL || bn2 == NULL) {
    1348           0 :                 bat_iterator_end(&qgrami);
    1349           0 :                 bat_iterator_end(&idi);
    1350           0 :                 bat_iterator_end(&posi);
    1351           0 :                 bat_iterator_end(&leni);
    1352           0 :                 BBPreclaim(bn);
    1353           0 :                 BBPreclaim(bn2);
    1354           0 :                 BBPunfix(qgram->batCacheid);
    1355           0 :                 BBPunfix(id->batCacheid);
    1356           0 :                 BBPunfix(pos->batCacheid);
    1357           0 :                 BBPunfix(len->batCacheid);
    1358           0 :                 throw(MAL, "txtsim.qgramselfjoin", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1359             :         }
    1360             : 
    1361           0 :         qbuf = (oid *) qgrami.base;
    1362           0 :         ibuf = (int *) idi.base;
    1363           0 :         pbuf = (int *) posi.base;
    1364           0 :         lbuf = (int *) leni.base;
    1365           0 :         for (i = 0; i < n - 1; i++) {
    1366           0 :                 for (j = i + 1;
    1367           0 :                          (j < n && qbuf[j] == qbuf[i]
    1368           0 :                           && pbuf[j] <= (pbuf[i] + (*k + *c * MIN(lbuf[i], lbuf[j]))));
    1369           0 :                          j++) {
    1370           0 :                         if (ibuf[i] != ibuf[j]
    1371           0 :                                 && abs(lbuf[i] - lbuf[j]) <= (*k + *c * MIN(lbuf[i], lbuf[j]))) {
    1372           0 :                                 if (BUNappend(bn, ibuf + i, false) != GDK_SUCCEED
    1373           0 :                                         || BUNappend(bn2, ibuf + j, false) != GDK_SUCCEED) {
    1374           0 :                                         bat_iterator_end(&qgrami);
    1375           0 :                                         bat_iterator_end(&idi);
    1376           0 :                                         bat_iterator_end(&posi);
    1377           0 :                                         bat_iterator_end(&leni);
    1378           0 :                                         BBPunfix(qgram->batCacheid);
    1379           0 :                                         BBPunfix(id->batCacheid);
    1380           0 :                                         BBPunfix(pos->batCacheid);
    1381           0 :                                         BBPunfix(len->batCacheid);
    1382           0 :                                         BBPreclaim(bn);
    1383           0 :                                         BBPreclaim(bn2);
    1384           0 :                                         throw(MAL, "txtsim.qgramselfjoin",
    1385             :                                                   SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1386             :                                 }
    1387             :                         }
    1388             :                 }
    1389             :         }
    1390           0 :         bat_iterator_end(&qgrami);
    1391           0 :         bat_iterator_end(&idi);
    1392           0 :         bat_iterator_end(&posi);
    1393           0 :         bat_iterator_end(&leni);
    1394             : 
    1395           0 :         BBPunfix(qgram->batCacheid);
    1396           0 :         BBPunfix(id->batCacheid);
    1397           0 :         BBPunfix(pos->batCacheid);
    1398           0 :         BBPunfix(len->batCacheid);
    1399             : 
    1400           0 :         *res1 = bn->batCacheid;
    1401           0 :         BBPkeepref(bn);
    1402           0 :         *res2 = bn2->batCacheid;
    1403           0 :         BBPkeepref(bn2);
    1404             : 
    1405           0 :         return MAL_SUCCEED;
    1406             : }
    1407             : 
    1408             : /* copy up to utf8len UTF-8 encoded characters from src to buf
    1409             :  * stop early if buf (size given by bufsize) is too small, or if src runs out
    1410             :  * return number of UTF-8 characters copied (excluding NUL)
    1411             :  * close with NUL if enough space */
    1412             : static size_t
    1413          26 : utf8strncpy(char *buf, size_t bufsize, const char *src, size_t utf8len)
    1414             : {
    1415          26 :         size_t cnt = 0;
    1416             : 
    1417         128 :         while (utf8len != 0 && *src != 0 && bufsize != 0) {
    1418         102 :                 bufsize--;
    1419         102 :                 utf8len--;
    1420         102 :                 cnt++;
    1421         102 :                 if (((*buf++ = *src++) & 0x80) != 0) {
    1422          40 :                         while ((*src & 0xC0) == 0x80 && bufsize != 0) {
    1423          20 :                                 *buf++ = *src++;
    1424          20 :                                 bufsize--;
    1425             :                         }
    1426             :                 }
    1427             :         }
    1428          26 :         if (bufsize != 0)
    1429          26 :                 *buf = 0;
    1430          26 :         return cnt;
    1431             : }
    1432             : 
    1433             : static str
    1434           2 : str_2_qgrams(bat *ret, str *val)
    1435             : {
    1436           2 :         BAT *bn;
    1437           2 :         size_t i, len = strlen(*val) + 5;
    1438           2 :         str s = GDKmalloc(len);
    1439           2 :         char qgram[4 * 6 + 1];          /* 4 UTF-8 code points plus NULL byte */
    1440             : 
    1441           2 :         if (s == NULL)
    1442           0 :                 throw(MAL, "txtsim.str2qgram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1443           2 :         strcpy(s, "##");
    1444           2 :         strcpy(s + 2, *val);
    1445           2 :         strcpy(s + len - 3, "$$");
    1446           2 :         bn = COLnew(0, TYPE_str, (BUN) strlen(*val), TRANSIENT);
    1447           2 :         if (bn == NULL) {
    1448           0 :                 GDKfree(s);
    1449           0 :                 throw(MAL, "txtsim.str2qgram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1450             :         }
    1451             : 
    1452             :         i = 0;
    1453          26 :         while (s[i]) {
    1454          26 :                 if (utf8strncpy(qgram, sizeof(qgram), s + i, 4) < 4)
    1455             :                         break;
    1456          24 :                 if (BUNappend(bn, qgram, false) != GDK_SUCCEED) {
    1457           0 :                         BBPreclaim(bn);
    1458           0 :                         GDKfree(s);
    1459           0 :                         throw(MAL, "txtsim.str2qgram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1460             :                 }
    1461          24 :                 if ((s[i++] & 0xC0) == 0xC0) {
    1462           8 :                         while ((s[i] & 0xC0) == 0x80)
    1463           4 :                                 i++;
    1464             :                 }
    1465             :         }
    1466           2 :         *ret = bn->batCacheid;
    1467           2 :         BBPkeepref(bn);
    1468           2 :         GDKfree(s);
    1469           2 :         return MAL_SUCCEED;
    1470             : }
    1471             : 
    1472             : #include "mel.h"
    1473             : mel_func txtsim_init_funcs[] = {
    1474             :         pattern("txtsim", "dameraulevenshtein", TXTSIMdameraulevenshtein, false, "Calculates Damerau-Levenshtein distance between two strings, operation costs (ins/del = 1, replacement = 1, transposition = 2)", args(1,3,arg("",int),arg("x",str),arg("y",str))),
    1475             :         pattern("txtsim", "dameraulevenshtein", TXTSIMdameraulevenshtein, false, "Calculates Damerau-Levenshtein distance between two strings, variable operation costs (ins/del, replacement, transposition)", args(1,6,arg("",int),arg("x",str),arg("y",str),arg("insdel_cost",int),arg("replace_cost",int),arg("transpose_cost",int))),
    1476             :         command("txtsim", "editdistance", TXTSIMdameraulevenshtein1, false, "Alias for Damerau-Levenshtein(str,str), insdel cost = 1, replace cost = 1 and transpose = 2", args(1,3, arg("",int),arg("s",str),arg("t",str))),
    1477             :         command("txtsim", "editdistance2", TXTSIMdameraulevenshtein2, false, "Alias for Damerau-Levenshtein(str,str), insdel cost = 1, replace cost = 1 and transpose = 1", args(1,3, arg("",int),arg("s",str),arg("t",str))),
    1478             :         pattern("txtsim", "levenshtein", TXTSIMlevenshtein, false, "Calculates Levenshtein distance between two strings, operation costs (ins/del = 1, replacement = 1)", args(1,3,arg("",int),arg("s",str),arg("t",str))),
    1479             :         pattern("txtsim", "levenshtein", TXTSIMlevenshtein, false, "Calculates Levenshtein distance between two strings, variable operation costs (ins/del, replacement)", args(1,5,arg("",int),arg("x",str),arg("y",str),arg("insdel_cost",int),arg("replace_cost",int))),
    1480             :         pattern("txtsim", "levenshtein", TXTSIMlevenshtein, false, "(Backwards compatibility purposes) Calculates Damerau-Levenshtein distance between two strings, variable operation costs (ins/del, replacement, transposition)", args(1,6,arg("",int),arg("x",str),arg("y",str),arg("insdel_cost",int),arg("replace_cost",int),arg("transpose_cost",int))),
    1481             :         pattern("txtsim", "maxlevenshtein", TXTSIMmaxlevenshtein, false, "Levenshtein distance with basic costs but up to a MAX", args(1, 4, arg("",int), arg("l",str),arg("r",str),arg("k",int))),
    1482             :         pattern("txtsim", "maxlevenshtein", TXTSIMmaxlevenshtein, false, "Levenshtein distance with variable costs but up to a MAX", args(1, 6, arg("",int), arg("l",str),arg("r",str),arg("k",int),arg("insdel_cost",int),arg("replace_cost",int))),
    1483             :         pattern("battxtsim", "maxlevenshtein", BATTXTSIMmaxlevenshtein, false, "Same as maxlevenshtein but for BATS", args(1, 4, batarg("",bit), batarg("l",str),batarg("r",str),arg("k",int))),
    1484             :         pattern("battxtsim", "maxlevenshtein", BATTXTSIMmaxlevenshtein, false, "Same as maxlevenshtein but for BATS", args(1, 6, batarg("",bit), batarg("l",str),batarg("r",str),arg("k",int),arg("insdel_cost",int),arg("replace_cost",int))),
    1485             :         command("txtsim", "maxlevenshteinjoin", TXTSIMmaxlevenshteinjoin, false, "", args(2,10, batarg("",oid),batarg("",oid),batarg("l",str),batarg("r",str),batarg("k",int),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
    1486             :         command("txtsim", "soundex", soundex, false, "Soundex function for phonetic matching", args(1,2, arg("",str),arg("name",str))),
    1487             :         command("txtsim", "stringdiff", stringdiff, false, "Calculate the soundexed editdistance", args(1,3, arg("",int),arg("s1",str),arg("s2",str))),
    1488             :         command("txtsim", "qgramnormalize", qgram_normalize, false, "'Normalizes' strings (eg. toUpper and replaces non-alphanumerics with one space", args(1,2, arg("",str),arg("input",str))),
    1489             :         command("txtsim", "qgramselfjoin", qgram_selfjoin, false, "QGram self-join on ordered(!) qgram tables and sub-ordered q-gram positions", args(2,8, batarg("",int),batarg("",int),batarg("qgram",oid),batarg("id",oid),batarg("pos",int),batarg("len",int),arg("c",flt),arg("k",int))),
    1490             :         command("txtsim", "str2qgrams", str_2_qgrams, false, "Break the string into 4-grams", args(1,2, batarg("",str),arg("s",str))),
    1491             :         command("txtsim", "jarowinkler", TXTSIMjarowinkler, false, "Calculate Jaro Winkler similarity", args(1,3, arg("",dbl),arg("x",str),arg("y",str))),
    1492             :         command("txtsim", "minjarowinkler", TXTSIMminjarowinkler, false, "", args(1, 4, arg("",bit), arg("l",str),arg("r",str),arg("threshold",dbl))),
    1493             :         command("txtsim", "minjarowinklerjoin", TXTSIMminjarowinklerjoin, false, "", args(2, 10, batarg("",oid),batarg("",oid), batarg("l",str),batarg("r",str),batarg("threshold",dbl),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
    1494             :         { .imp=NULL }
    1495             : };
    1496             : #include "mal_import.h"
    1497             : #ifdef _MSC_VER
    1498             : #undef read
    1499             : #pragma section(".CRT$XCU",read)
    1500             : #endif
    1501         334 : LIB_STARTUP_FUNC(init_txtsim_mal)
    1502         334 : { mal_module("txtsim", NULL, txtsim_init_funcs); }

Generated by: LCOV version 1.14