Mercurial > hg > MonetDB-extend
view regexp/regexp.c @ 33:a8ffdbc388ce
Ported to Jun2020 branch.
author | Sjoerd Mullender <sjoerd@acm.org> |
---|---|
date | Tue, 09 Jun 2020 10:30:35 +0200 (2020-06-09) |
parents | 25cd8af6fa82 |
children | e5d2d0c9b7b3 |
line wrap: on
line source
/* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. * * Copyright 2013-2018 MonetDB B.V. */ /* monetdb_config.h must be included as the first include file */ #include <monetdb_config.h> /* mal_exception.h actually contains everything we need */ #include <mal_exception.h> /* for the candidate iterator */ #include <gdk_cand.h> /* system include files */ #include <string.h> /* we use the PCRE library to do regular expression matching */ #include <pcre.h> /* __declspec() must be used on Windows, but not on other systems */ #ifndef _MSC_VER /* not Windows */ #define __declspec(x) /* nothing */ #endif /* these eight functions are the only externally visible functions * since they are the only ones that are called from the MAL layer; on * Windows they must be exported, on other systems, declaring them as * extern is enough */ extern __declspec(dllexport) char *regexpmatch(bit *ret, const char **val, const char **pat); extern __declspec(dllexport) char *regexpmatchf(bit *ret, const char **val, const char **pat, const char **flags); extern __declspec(dllexport) char *regexpmatchselect(bat *ret, const bat *bid, const bat *sid, const char **pat, const bit *anti); extern __declspec(dllexport) char *regexpmatchfselect(bat *ret, const bat *bid, const bat *sid, const char **pat, const char **flags, const bit *anti); extern __declspec(dllexport) char *regexpmatchjoin(bat *lres, bat *rres, const bat *lid, const bat *rid, const bat *sl, const bat *sr, const bit *nil_matches, const lng *estimate); extern __declspec(dllexport) char *regexpmatchfjoin(bat *lres, bat *rres, const bat *lid, const bat *rid, const char **flags, const bat *sl, const bat *sr, const bit *nil_matches, const lng *estimate); extern __declspec(dllexport) char *regexpmatchbulk(bat *ret, const bat *bid, const char **pat); extern __declspec(dllexport) char *regexpmatchfbulk(bat *ret, const bat *bid, const char **pat, const char **flags); static int parseflags(const char *flags) { int options = PCRE_UTF8; /* MonetDB uses UTF-8 exclusively */ if (flags) { while (*flags) { switch (*flags) { case 'i': /* case insensitive */ options |= PCRE_CASELESS; break; case 'x': /* extended regular expressions */ options |= PCRE_EXTENDED; break; case 'm': /* multiline matching */ options |= PCRE_MULTILINE | PCRE_DOTALL;; break; default: return -1; /* indicate there was an error */ } flags++; } } return options; } static char * do_match(bit *ret, const char *val, const char *pat, const char *flags) { const char *err = NULL; int options; int pos = 0; pcre *re; /* if any of the input values are nil, the result is no match */ if (strNil(val) || strNil(pat) || strNil(flags)) { /* special case for NIL inputs: NILs don't match anything */ *ret = 0; return MAL_SUCCEED; } options = parseflags(flags); if (options == -1) throw(MAL, "regexp.rematch", "bad flag character"); re = pcre_compile(pat, options, &err, &pos, NULL); if (re == NULL) { throw(MAL, "regexp.rematch", "compilation of regular expression (%s) failed at %d with %s", pat, pos, err); } pos = pcre_exec(re, NULL, val, (int) strlen(val), 0, 0, NULL, 0); pcre_free(re); if (pos < PCRE_ERROR_NOMATCH) { throw(MAL, "regexp.rematch", "matching of regular expression (%s) failed with %d", pat, pos); } *ret = pos >= 0; return MAL_SUCCEED; } char * regexpmatch(bit *ret, const char **val, const char **pat) { return do_match(ret, *val, *pat, ""); } char * regexpmatchf(bit *ret, const char **val, const char **pat, const char **flags) { return do_match(ret, *val, *pat, *flags); } static char * do_matchbulk(bat *ret, bat bid, const char *pat, const char *flags) { BAT *b; /* input BAT */ BATiter bi; /* helper to loop through values */ BAT *bn; /* result BAT */ bit *outp; /* pointer through which we add to result */ BUN start, end; /* iteration variables */ const char *err = NULL; /* error message from PCRE library */ int pos = 0; /* error position from PCRE library */ int options; /* PCRE options */ pcre *re; /* compiled regular expression */ pcre_extra *sd; /* studied regular expression */ /* from the BAT ID we need to get the BAT descriptor, making * sure that the data of the BAT is loaded into memory */ if ((b = BATdescriptor(bid)) == NULL) { throw(MAL, "batregexp.rematch", RUNTIME_OBJECT_MISSING); } /* check that the BAT has the expected type: we expect str or * something compatible with str (if we only want str, we need * to compare b->ttype with TYPE_str and not use ATOMstorage). * Note, the MAL interpreter will only call this function with * a str BAT because that is the only interface that is * defined in the MAL file, so this check is superfluous. */ if (ATOMstorage(b->ttype) != TYPE_str) { BBPunfix(b->batCacheid); throw(MAL, "batregexp.rematch", SEMANTIC_TYPE_MISMATCH); } /* if any of the input values are nil, the result is no match */ if (strNil(pat) || strNil(flags)) { /* no matches when the pattern or the flags is NIL * we return an a BAT with all NIL values */ bit f = bit_nil; if ((bn = BATconstant(b->hseqbase, TYPE_bit, &f, BATcount(b), TRANSIENT)) == NULL) throw(MAL, "batregexp.rematch", GDK_EXCEPTION); *ret = bn->batCacheid; BBPkeepref(*ret); BBPunfix(b->batCacheid); return MAL_SUCCEED; } options = parseflags(flags); if (options == -1) { BBPunfix(b->batCacheid); throw(MAL, "batregexp.rematch", "bad flag character"); } /* allocate a result BAT; the capacity we ask for is the size * of the input BAT since we produce a value for each input * value */ bn = COLnew(b->hseqbase, TYPE_bit, BATcount(b), TRANSIENT); if (bn == NULL) { BBPunfix(b->batCacheid); throw(MAL, "batregexp.rematch", GDK_EXCEPTION); } /* Position outp at the start of the result array. * We know the array is large enough even if every value were * to match, so we don't need to check for that. */ outp = (bit *) Tloc(bn, 0); /* compile the regular expression */ re = pcre_compile(pat, options, &err, &pos, NULL); if (re == NULL) { BBPunfix(b->batCacheid); BBPreclaim(bn); throw(MAL, "batregexp.rematch", "compilation of regular expression (%s) failed at %d with %s", pat, pos, err); } /* invest in study of the r.e. */ sd = pcre_study(re, 0, &err); if (err != NULL) { pcre_free(re); BBPunfix(b->batCacheid); BBPreclaim(bn); throw(MAL, "batregexp.rematch", "study of regular expression (%s) failed with %s", pat, err); } /* now, start and end are the limits in b that we need to look * at, and if set, cand and candend are the beginning and end * of the list of OIDs of b that we need to consider */ bi = bat_iterator(b); /* we will change these if we add a NIL */ bn->tnil = false; bn->tnonil = true; for (start = 0, end = BATcount(b); start < end; start++) { const char *val = BUNtvar(bi, start); /* nil values never match */ if (strNil(val)) { *outp++ = bit_nil; bn->tnil = true; bn->tnonil = false; } else { pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); if (pos < 0 && pos != PCRE_ERROR_NOMATCH && pos != PCRE_ERROR_BADUTF8) { /* error during processing */ BBPunfix(b->batCacheid); BBPreclaim(bn); pcre_free_study(sd); pcre_free(re); throw(MAL, "batregexp.rematch", "matching of regular expression (%s) failed with %d", pat, pos); } *outp++ = pos >= 0; /* TRUE if match, FALSE if not */ } } /* set properties and size of result BAT */ BATsetcount(bn, BATcount(b)); if (BATcount(bn) > 1) { /* if more than 1 result, it is not reverse sorted */ bn->tsorted = false; /* probably not sorted */ bn->trevsorted = false; /* probably not reverse sorted */ bn->tkey = false; /* probably not key */ } else { /* if empty or a single result, it is sorted, reverse * sorted, and key */ bn->tsorted = true; bn->trevsorted = true; bn->tkey = true; } bn->tnosorted = 0; /* we don't know for sure */ bn->tnorevsorted = 0; /* we don't know for sure */ bn->tnokey[0] = bn->tnokey[1] = 0; /* we're done with b and re */ BBPunfix(b->batCacheid); pcre_free_study(sd); pcre_free(re); *ret = bn->batCacheid; BBPkeepref(*ret); return MAL_SUCCEED; } char * regexpmatchbulk(bat *ret, const bat *bid, const char **pat) { return do_matchbulk(ret, *bid, *pat, ""); } char * regexpmatchfbulk(bat *ret, const bat *bid, const char **pat, const char **flags) { return do_matchbulk(ret, *bid, *pat, *flags); } static char * do_select(bat *ret, bat bid, bat sid, const char *pat, const char *flags, bit anti) { BAT *b, *s = NULL; /* input BAT and optional candidate list */ struct canditer ci; /* candidate iterator */ BATiter bi; /* helper to loop through values */ BAT *bn; /* result BAT */ oid *outp; /* pointer through which we add to result */ const char *err = NULL; /* error message from PCRE library */ int pos = 0; /* error position from PCRE library */ int options; /* PCRE options */ pcre *re; /* compiled regular expression */ pcre_extra *sd; /* studied regular expression */ /* if any of the input values are nil, the result is no match */ if (strNil(pat) || strNil(flags)) { /* no matches when the pattern or the flags is NIL * we return an empty BAT of the correct type */ if ((bn = BATdense(0, 0, 0)) == NULL) throw(MAL, "regexp.rematchselect", GDK_EXCEPTION); *ret = bn->batCacheid; BBPkeepref(*ret); return MAL_SUCCEED; } options = parseflags(flags); if (options == -1) throw(MAL, "regexp.rematchselect", "bad flag character"); /* from the BAT IDs we need to get the BAT descriptors, making * sure that the data of the BATs are loaded into memory */ if ((b = BATdescriptor(bid)) == NULL) { throw(MAL, "regexp.rematchselect", RUNTIME_OBJECT_MISSING); } /* check that the BAT has the expected type: we expect str or * something compatible with str (if we only want str, we need * to compare b->ttype with TYPE_str and not use ATOMstorage). * Note, the MAL interpreter will only call this function with * a str BAT because that is the only interface that is * defined in the MAL file, so this check is superfluous. */ if (ATOMstorage(b->ttype) != TYPE_str) { BBPunfix(b->batCacheid); throw(MAL, "regexp.rematchselect", SEMANTIC_TYPE_MISMATCH); } if (!is_bat_nil(sid) && (s = BATdescriptor(sid)) == NULL) { BBPunfix(b->batCacheid); throw(MAL, "regexp.rematchselect", RUNTIME_OBJECT_MISSING); } if (canditer_init(&ci, b, s) == 0) { /* trivially empty result */ BBPunfix(b->batCacheid); if (s) BBPunfix(s->batCacheid); bn = BATdense(0, 0, 0); *ret = bn->batCacheid; BBPkeepref(*ret); return MAL_SUCCEED; } /* allocate a result BAT; the capacity we ask for is the * maximum potential result size (i.e. the size of the * candidate list if there is one, else the size of the input * BAT b) */ bn = COLnew(0, TYPE_oid, ci.ncand, TRANSIENT); if (bn == NULL) { BBPunfix(b->batCacheid); if (s) BBPunfix(s->batCacheid); throw(MAL, "regexp.rematchselect", GDK_EXCEPTION); } /* Position outp at the start of the result array. * We know the array is large enough even if every value were * to match, so we don't need to check for that. */ outp = (oid *) Tloc(bn, 0); /* compile the regular expression */ re = pcre_compile(pat, options, &err, &pos, NULL); if (re == NULL) { BBPunfix(b->batCacheid); if (s) BBPunfix(s->batCacheid); BBPreclaim(bn); throw(MAL, "regexp.rematchselect", "compilation of regular expression (%s) failed at %d with %s", pat, pos, err); } /* invest in study of the r.e. */ sd = pcre_study(re, 0, &err); if (err != NULL) { pcre_free(re); BBPunfix(b->batCacheid); if (s) BBPunfix(s->batCacheid); BBPreclaim(bn); throw(MAL, "regexp.rematchselect", "study of regular expression (%s) failed with %s", pat, err); } bi = bat_iterator(b); /* iterate through the candidates */ for (BUN i = 0; i < ci.ncand; i++) { /* get the next candidate */ oid o = canditer_next(&ci); /* the candidate list has a list of OIDs which are * relative to b->hseqbase, we need to convert that to * an index relative to the start of the array in the * (tail) heap */ const char *val = BUNtvar(bi, o - b->hseqbase); /* nil values never match */ if (!strNil(val)) { pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); if (pos >= 0) { /* regular expression matched */ if (!anti) *outp++ = o; } else if (pos == PCRE_ERROR_NOMATCH) { /* regular expression didn't match */ if (anti) *outp++ = o; } else { /* error during processing */ BBPunfix(b->batCacheid); BBPunfix(s->batCacheid); BBPreclaim(bn); pcre_free_study(sd); pcre_free(re); throw(MAL, "regexp.rematchselect", "matching of regular expression (%s) failed with %d", pat, pos); } } } /* we're done with b, s, and re */ BBPunfix(b->batCacheid); if (s) BBPunfix(s->batCacheid); pcre_free_study(sd); pcre_free(re); /* set properties and size of result BAT */ BATsetcount(bn, (BUN) (outp - (oid *) Tloc(bn, 0))); /* size is pointer difference */ /* the result BAT of a select operation MUST be sorted, and * all values MUST be distinct (i.e. it is a candidate list); * due to the way we created the result, we know this is the * case */ bn->tsorted = true; bn->tnosorted = 0; bn->tkey = true; bn->tseqbase = oid_nil; if (BATcount(bn) > 1) { /* if more than 1 result, it is not reverse sorted */ bn->trevsorted = false; bn->tnorevsorted = 1; /* index 1 is larger than index 0 */ /* the BAT is dense if the type is TYPE_oid, the * values are sorted in ascending order, they are all * distinct, and they form a consecutive sequence (no * missing values); we only need to check the last * condition, which we do by checking the difference * between the first and last values */ outp = (oid *) Tloc(bn, 0); /* pointer to start */ if (outp[BATcount(bn) - 1] - outp[0] == BATcount(bn) - 1) bn->tseqbase = outp[0]; } else { /* if empty or a single result, it is reverse sorted * and dense */ bn->trevsorted = true; bn->tnorevsorted = 0; bn->tseqbase = BATcount(bn) == 0 ? 0 : *(oid *) Tloc(bn, 0); } /* there are no NIL values in the result */ bn->tnil = false; bn->tnonil = true; *ret = bn->batCacheid; BBPkeepref(*ret); return MAL_SUCCEED; } char * regexpmatchselect(bat *ret, const bat *bid, const bat *sid, const char **pat, const bit *anti) { return do_select(ret, *bid, sid ? *sid : 0, *pat, "", *anti); } char * regexpmatchfselect(bat *ret, const bat *bid, const bat *sid, const char **pat, const char **flags, const bit *anti) { return do_select(ret, *bid, sid ? *sid : 0, *pat, *flags, *anti); } static char * do_join(bat *lres, bat *rres, bat lid, bat rid, const char *flags, bat slid, bat srid, bit nil_matches, lng estimate) { BAT *l, *r, *sl = NULL, *sr = NULL; /* input BATs */ BAT *bn1, *bn2; /* output BATs */ struct canditer lci; /* candidate iterator for l */ struct canditer rci; /* candidate iterator for r */ BATiter li; /* helper to loop through values */ BATiter ri; /* helper to loop through values */ oid ro; /* right OID being matched */ const char *pat; /* the regular expression being matched */ const char *err; /* error message from PCRE library */ int pos = 0; /* error position from PCRE library */ int options; /* PCRE options */ pcre *re; /* compiled regular expression */ pcre_extra *sd; /* studied regular expression */ (void) nil_matches; /* only relevant for equi-join */ if (strNil(flags)) { /* no matches when the flags is NIL * we return two empty BATs */ bn1 = BATdense(0, 0, 0); bn2 = BATdense(0, 0, 0); if (bn1 == NULL || bn2 == NULL) { BBPreclaim(bn1); BBPreclaim(bn2); throw(MAL, "regexp.rematchjoin", GDK_EXCEPTION); } return MAL_SUCCEED; } options = parseflags(flags); if (options == -1) throw(MAL, "regexp.rematchjoin", "bad flag character"); l = BATdescriptor(lid); r = BATdescriptor(rid); if (!is_bat_nil(slid)) sl = BATdescriptor(slid); if (!is_bat_nil(srid)) sr = BATdescriptor(srid); if (l == NULL || r == NULL || (!is_bat_nil(slid) && sl == NULL) || (!is_bat_nil(srid) && sr == NULL)) { /* one of the calls to BATdescriptor failed */ if (l) BBPunfix(l->batCacheid); if (r) BBPunfix(r->batCacheid); if (sl) BBPunfix(sl->batCacheid); if (sr) BBPunfix(sr->batCacheid); throw(MAL, "regexp.rematchjoin", RUNTIME_OBJECT_MISSING); } /* check that the BATs have the expected type: we expect str * or something compatible with str for l (the values) and str * for r (the patterns). * Note, the MAL interpreter will only call this function with * a pair of str BATs because that is the only interface that * is defined in the MAL file, so this check is superfluous. */ if (ATOMstorage(l->ttype) != TYPE_str || r->ttype != TYPE_str) { BBPunfix(l->batCacheid); BBPunfix(r->batCacheid); if (sl) BBPunfix(sl->batCacheid); if (sr) BBPunfix(sr->batCacheid); throw(MAL, "regexp.rematchjoin", SEMANTIC_TYPE_MISMATCH); } if (canditer_init(&lci, l, sl) == 0 || canditer_init(&rci, r, sr) == 0) { /* if either side is empty (or no candidates) the * result is empty */ BBPunfix(l->batCacheid); BBPunfix(r->batCacheid); if (sl) BBPunfix(sl->batCacheid); if (sr) BBPunfix(sr->batCacheid); bn1 = BATdense(0, 0, 0); bn2 = BATdense(0, 0, 0); if (bn1 == NULL || bn2 == NULL) { BBPreclaim(bn1); BBPreclaim(bn2); throw(MAL, "regexp.rematchjoin", GDK_EXCEPTION); } return MAL_SUCCEED; } /* if there is no valid estimate, use the size of the left * input as size estimate */ if (is_lng_nil(estimate) || estimate == 0) estimate = lci.ncand; /* create the output BATs */ bn1 = COLnew(0, TYPE_oid, estimate, TRANSIENT); bn2 = COLnew(0, TYPE_oid, estimate, TRANSIENT); if (bn1 == NULL || bn2 == NULL) { /* one of the calls to COLnew failed * note, BBPreclaim checks whether its argument is NULL */ BBPreclaim(bn1); BBPreclaim(bn2); BBPunfix(l->batCacheid); BBPunfix(r->batCacheid); if (sl) BBPunfix(sl->batCacheid); if (sr) BBPunfix(sr->batCacheid); throw(MAL, "regexp.rematchjoin", GDK_EXCEPTION); } li = bat_iterator(l); ri = bat_iterator(r); for (BUN i = 0; i < rci.ncand; i++) { ro = canditer_next(&rci); pat = BUNtvar(ri, ro - r->hseqbase); /* nil regular expressions don't match (despite * nil_matches) */ if (strNil(pat)) continue; re = pcre_compile(pat, options, &err, &pos, NULL); sd = NULL; if (re == NULL) goto bailout; sd = pcre_study(re, 0, &err); if (err != NULL) goto bailout; /* inner loop: reset iterator, then iterate over it */ canditer_reset(&lci); for (BUN j = 0; j < lci.ncand; j++) { oid lo = canditer_next(&lci); const char *val = BUNtvar(li, lo - l->hseqbase); if (strNil(val)) continue; pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); if (pos >= 0) { /* regular expression matched */ if (BUNappend(bn1, &lo, false) != GDK_SUCCEED || BUNappend(bn2, &ro, false) != GDK_SUCCEED) goto bailout; } else if (pos != PCRE_ERROR_NOMATCH) { /* error during processing */ err = "matching of regular expression failed"; goto bailout; } } pcre_free_study(sd); pcre_free(re); } BBPunfix(l->batCacheid); BBPunfix(r->batCacheid); if (sl) BBPunfix(sl->batCacheid); if (sr) BBPunfix(sr->batCacheid); *lres = bn1->batCacheid; *rres = bn2->batCacheid; BBPkeepref(*lres); BBPkeepref(*rres); return MAL_SUCCEED; bailout: BBPreclaim(bn1); BBPreclaim(bn2); BBPunfix(l->batCacheid); BBPunfix(r->batCacheid); if (sl) BBPunfix(sl->batCacheid); if (sr) BBPunfix(sr->batCacheid); if (sd) pcre_free_study(sd); if (re) pcre_free(re); if (err) throw(MAL, "pcre.rematchjoin", "error with regular expression: %s", err); throw(MAL, "pcre.rematchjoin", GDK_EXCEPTION); } char * regexpmatchjoin(bat *lres, bat *rres, const bat *lid, const bat *rid, const bat *sl, const bat *sr, const bit *nil_matches, const lng *estimate) { return do_join(lres, rres, *lid, *rid, "", *sl, *sr, *nil_matches, *estimate); } char * regexpmatchfjoin(bat *lres, bat *rres, const bat *lid, const bat *rid, const char **flags, const bat *sl, const bat *sr, const bit *nil_matches, const lng *estimate) { return do_join(lres, rres, *lid, *rid, *flags, *sl, *sr, *nil_matches, *estimate); }