Mercurial > hg > MonetDB-extend
view regexp/regexp.c @ 24:d67f80012f70
typo
author | Jennie Zhang <y.zhang@cwi.nl> |
---|---|
date | Wed, 24 Jan 2018 13:06:44 +0100 (2018-01-24) |
parents | 499debebda5d |
children | f0739f6c1a43 |
line wrap: on
line source
/* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. * * Copyright 2013-2018 MonetDB B.V. */ /* monetdb_config.h must be included as the first include file */ #include <monetdb_config.h> /* mal_exception.h actually contains everything we need */ #include <mal_exception.h> /* for the CANDINIT macro */ #include <gdk_cand.h> /* pre Jul2017-SP5, replace with CANDINIT definition */ /* system include files */ #include <string.h> /* we use the PCRE library to do regular expression matching */ #include <pcre.h> /* __declspec() must be used on Windows, but not on other systems */ #ifndef _MSC_VER /* not Windows */ #define __declspec(x) /* nothing */ #endif /* these six functions are the only externally visible functions; on * Windows they must be exported, on other systems, declaring them as * extern is enough */ extern __declspec(dllexport) char *regexpmatch(bit *ret, const char **val, const char **pat); extern __declspec(dllexport) char *regexpmatchf(bit *ret, const char **val, const char **pat, const char **flags); extern __declspec(dllexport) char *regexpmatchselect(bat *ret, const bat *bid, const bat *sid, const char **pat, const bit *anti); extern __declspec(dllexport) char *regexpmatchfselect(bat *ret, const bat *bid, const bat *sid, const char **pat, const char **flags, const bit *anti); extern __declspec(dllexport) char *regexpmatchjoin(bat *lres, bat *rres, const bat *lid, const bat *rid, const bat *sl, const bat *sr, const bit *nil_matches, const lng *estimate); extern __declspec(dllexport) char *regexpmatchfjoin(bat *lres, bat *rres, const bat *lid, const bat *rid, const char **flags, const bat *sl, const bat *sr, const bit *nil_matches, const lng *estimate); static int parseflags(const char *flags) { int options = PCRE_UTF8; /* MonetDB uses UTF-8 exclusively */ if (flags) { while (*flags) { switch (*flags) { case 'i': /* case insensitive */ options |= PCRE_CASELESS; break; case 'x': /* extended regular expressions */ options |= PCRE_EXTENDED; break; default: return -1; /* indicate there was an error */ } flags++; } } return options; } static char * do_match(bit *ret, const char *val, const char *pat, const char *flags) { const char *err = NULL; int options; int pos = 0; pcre *re; /* if any of the input values are nil, the result is no match */ if (GDK_STRNIL(val) || GDK_STRNIL(pat) || GDK_STRNIL(flags)) { /* special case for NIL inputs: NILs don't match anything */ *ret = 0; return MAL_SUCCEED; } options = parseflags(flags); if (options == -1) throw(MAL, "regexp.rematch", "bad flag character"); re = pcre_compile(pat, options, &err, &pos, NULL); if (re == NULL) { throw(MAL, "regexp.rematch", "compilation of regular expression (%s) failed at %d with %s", pat, pos, err); } pos = pcre_exec(re, NULL, val, (int) strlen(val), 0, 0, NULL, 0); pcre_free(re); if (pos < PCRE_ERROR_NOMATCH) { throw(MAL, "regexp.rematch", "matching of regular expression (%s) failed with %d", pat, pos); } *ret = pos >= 0; return MAL_SUCCEED; } char * regexpmatch(bit *ret, const char **val, const char **pat) { return do_match(ret, *val, *pat, ""); } char * regexpmatchf(bit *ret, const char **val, const char **pat, const char **flags) { return do_match(ret, *val, *pat, *flags); } static char * do_select(bat *ret, bat bid, bat sid, const char *pat, const char *flags, bit anti) { BAT *b, *s = NULL; /* input BAT and optional candidate list */ BUN start, end; /* start and end index */ BUN cnt; /* difference between start and end (unused) */ const oid *cand, *candend; /* start and end of candidate list */ BATiter bi; /* helper to loop through values */ BAT *bn; /* result BAT */ oid *outp; /* pointer through which we add to result */ const char *err = NULL; /* error message from PCRE library */ int pos = 0; /* error position from PCRE library */ int options; /* PCRE options */ pcre *re; /* compiled regular expression */ pcre_extra *sd; /* studied regular expression */ /* if any of the input values are nil, the result is no match */ if (GDK_STRNIL(pat) || GDK_STRNIL(flags)) { /* no matches when the pattern or the flags is NIL * we return an empty BAT of the correct type */ if ((bn = BATdense(0, 0, 0)) == NULL) throw(MAL, "regexp.rematchselect", GDK_EXCEPTION); *ret = bn->batCacheid; BBPkeepref(*ret); return MAL_SUCCEED; } options = parseflags(flags); if (options == -1) throw(MAL, "regexp.rematchselect", "bad flag character"); /* from the BAT IDs we need to get the BAT descriptors, making * sure that the data of the BATs are loaded into memory */ if ((b = BATdescriptor(bid)) == NULL) { throw(MAL, "regexp.rematchselect", RUNTIME_OBJECT_MISSING); } if (sid != 0 && sid != bat_nil && (s = BATdescriptor(sid)) == NULL) { BBPunfix(b->batCacheid); throw(MAL, "regexp.rematchselect", RUNTIME_OBJECT_MISSING); } /* allocate a result BAT; the capacity we ask for is the * maximum potential result size (i.e. the size of the * candidate list if there is one, else the size of the input * BAT b) */ bn = COLnew(0, TYPE_oid, s ? BATcount(s) : BATcount(b), TRANSIENT); if (bn == NULL) { BBPunfix(b->batCacheid); if (s) BBPunfix(s->batCacheid); throw(MAL, "regexp.rematchselect", GDK_EXCEPTION); } /* Position outp at the start of the result array. * We know the array is large enough even if every value were * to match, so we don't need to check for that. */ outp = (oid *) Tloc(bn, 0); /* compile the regular expression */ re = pcre_compile(pat, options, &err, &pos, NULL); if (re == NULL) { BBPunfix(b->batCacheid); if (s) BBPunfix(s->batCacheid); BBPreclaim(bn); throw(MAL, "regexp.rematchselect", "compilation of regular expression (%s) failed at %d with %s", pat, pos, err); } /* invest in study of the r.e. */ sd = pcre_study(re, 0, &err); if (err != NULL) { pcre_free(re); BBPunfix(b->batCacheid); if (s) BBPunfix(s->batCacheid); BBPreclaim(bn); throw(MAL, "regexp.rematchselect", "study of regular expression (%s) failed with %s", pat, err); } /* initialize the start, end, cnt, cand, and candend variables * from b and s*/ CANDINIT(b, s, start, end, cnt, cand, candend); /* now, start and end are the limits in b that we need to look * at, and if set, cand and candend are the beginning and end * of the list of OIDs of b that we need to consider */ bi = bat_iterator(b); if (cand) { /* there is a non-dense candidate list, we run through * it and only check values in b that are referenced * in this candidate list */ while (cand < candend) { /* the candidate list has a list of OIDs which * are relative to b->hseqbase, we need to * convert that to an index relative to the * start of the array in the (tail) heap */ const char *val = BUNtvar(bi, *cand - b->hseqbase); /* nil values never match */ if (!GDK_STRNIL(val)) { pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); if (pos >= 0) { /* regular expression matched */ if (!anti) *outp++ = *cand; } else if (pos == PCRE_ERROR_NOMATCH) { /* regular expression didn't match */ if (anti) *outp++ = *cand; } else { /* error during processing */ BBPunfix(b->batCacheid); BBPunfix(s->batCacheid); BBPreclaim(bn); pcre_free_study(sd); pcre_free(re); throw(MAL, "regexp.rematchselect", "matching of regular expression (%s) failed with %d", pat, pos); } } cand++; } /* we're done with s */ BBPunfix(s->batCacheid); } else { /* we're done with s */ if (s) { /* there is a dense candidate list, we don't * need it anymore since we extracted start * and end */ BBPunfix(s->batCacheid); } while (start < end) { const char *val = BUNtvar(bi, start); /* nil values never match */ if (!GDK_STRNIL(val)) { pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); /* note that we need to convert the * index relative to the start of the * array to a value relative to * b->hseqbase (opposite from what * happens above) */ if (pos >= 0) { /* regular expression matched */ if (!anti) *outp++ = b->hseqbase + start; } else if (pos == PCRE_ERROR_NOMATCH) { /* regular expression didn't match */ if (anti) *outp++ = b->hseqbase + start; } else { /* error during processing */ BBPunfix(b->batCacheid); BBPreclaim(bn); pcre_free_study(sd); pcre_free(re); throw(MAL, "regexp.rematchselect", "matching of regular expression (%s) failed with %d", pat, pos); } } start++; } } /* we're done with b and re */ BBPunfix(b->batCacheid); pcre_free_study(sd); pcre_free(re); /* set properties and size of result BAT */ BATsetcount(bn, (BUN) (outp - (oid *) Tloc(bn, 0))); /* size is pointer difference */ /* the result BAT of a select operation MUST be sorted, and * all values MUST be distinct (i.e. it is a candidate list); * due to the way we created the result, we know this is the * case */ bn->tsorted = 1; bn->tnosorted = 0; bn->tkey = 1; if (BATcount(bn) > 1) { /* if more than 1 result, it is not reverse sorted */ bn->trevsorted = 0; bn->tnorevsorted = 1; /* index 1 is larger than index 0 */ /* the BAT is dense if the type is TYPE_oid, the * values are sorted in ascending order, they are all * distinct, and they form a consecutive sequence (no * missing values); we only need to check the last * condition, which we do by checking the difference * between the first and last values */ outp = (oid *) Tloc(bn, 0); /* pointer to start */ bn->tdense = outp[BATcount(bn) - 1] - outp[0] == BATcount(bn) - 1; } else { /* if empty or a single result, it is reverse sorted * and dense */ bn->trevsorted = 1; bn->tnorevsorted = 0; bn->tdense = 1; } if (bn->tdense) { /* if dense, we must also set tseqbase to the first value */ if (BATcount(bn) == 0) bn->tseqbase = 0; /* no first value */ else bn->tseqbase = * (oid *) Tloc(bn, 0); } else { bn->tseqbase = oid_nil; } /* there are no NIL values in the result */ bn->tnil = 0; bn->tnonil = 1; *ret = bn->batCacheid; BBPkeepref(*ret); return MAL_SUCCEED; } char * regexpmatchselect(bat *ret, const bat *bid, const bat *sid, const char **pat, const bit *anti) { return do_select(ret, *bid, sid ? *sid : 0, *pat, "", *anti); } char * regexpmatchfselect(bat *ret, const bat *bid, const bat *sid, const char **pat, const char **flags, const bit *anti) { return do_select(ret, *bid, sid ? *sid : 0, *pat, *flags, *anti); } static char * do_join(bat *lres, bat *rres, bat lid, bat rid, const char *flags, bat slid, bat srid, bit nil_matches, lng estimate) { BAT *l, *r, *sl = NULL, *sr = NULL; /* input BATs */ BAT *bn1, *bn2; /* output BATs */ BUN lstart, lend; /* start and end index */ BUN lcnt; /* difference between start and end (unused) */ const oid *lcand, *lcandend; /* start and end of candidate list */ BATiter li; /* helper to loop through values */ BUN rstart, rend; /* start and end index */ BUN rcnt; /* difference between start and end (unused) */ const oid *rcand, *rcandend; /* start and end of candidate list */ BATiter ri; /* helper to loop through values */ oid ro; /* right OID being matched */ const char *pat; /* the regular expression being matched */ const char *err; /* error message from PCRE library */ int pos = 0; /* error position from PCRE library */ int options; /* PCRE options */ pcre *re; /* compiled regular expression */ pcre_extra *sd; /* studied regular expression */ (void) nil_matches; /* only relevant for equi-join */ if (GDK_STRNIL(flags)) { /* no matches when the flags is NIL * we return two empty BATs */ bn1 = BATdense(0, 0, 0); bn2 = BATdense(0, 0, 0); if (bn1 == NULL || bn2 == NULL) { BBPreclaim(bn1); BBPreclaim(bn2); throw(MAL, "regexp.rematchjoin", GDK_EXCEPTION); } return MAL_SUCCEED; } options = parseflags(flags); if (options == -1) throw(MAL, "regexp.rematchjoin", "bad flag character"); l = BATdescriptor(lid); r = BATdescriptor(rid); if (slid != 0 && slid != bat_nil) sl = BATdescriptor(slid); if (srid != 0 && srid != bat_nil) sr = BATdescriptor(srid); if (l == NULL || r == NULL || (slid != 0 && slid != bat_nil && sl == NULL) || (srid != 0 && srid != bat_nil && sr == NULL)) { /* one of the calls to BATdescriptor failed */ if (l) BBPunfix(l->batCacheid); if (r) BBPunfix(r->batCacheid); if (sl) BBPunfix(sl->batCacheid); if (sr) BBPunfix(sr->batCacheid); throw(MAL, "regexp.rematchselect", RUNTIME_OBJECT_MISSING); } /* if there is no valid estimate, use the size of the left * input as size estimate */ if (estimate == lng_nil || estimate == 0) estimate = sl ? BATcount(sl) : BATcount(l); /* create the output BATs */ bn1 = COLnew(0, TYPE_oid, estimate, TRANSIENT); bn2 = COLnew(0, TYPE_oid, estimate, TRANSIENT); if (bn1 == NULL || bn2 == NULL) { /* one of the calls to COLnew failed * note, BBPreclaim check whether its argument is NULL */ BBPreclaim(bn1); BBPreclaim(bn2); BBPunfix(l->batCacheid); BBPunfix(r->batCacheid); if (sl) BBPunfix(sl->batCacheid); if (sr) BBPunfix(sr->batCacheid); throw(MAL, "regexp.rematchselect", GDK_EXCEPTION); } /* extract boundaries and candidate lists from inputs */ CANDINIT(l, sl, lstart, lend, lcnt, lcand, lcandend); CANDINIT(r, sr, rstart, rend, rcnt, rcand, rcandend); li = bat_iterator(l); ri = bat_iterator(r); /* if there is a candidate list (sl or sr is not NULL), but it * is dense (lcand/rcand is NULL), we don't need the BAT * descriptor anymore, the start and end values suffice */ if (sl && lcand == NULL) { BBPunfix(sl->batCacheid); sl = NULL; } if (sr && rcand == NULL) { BBPunfix(sr->batCacheid); sr = NULL; } for (;;) { if (rcand) { if (rcand == rcandend) break; ro = *rcand++; pat = BUNtvar(ri, ro - r->hseqbase); } else { if (rstart == rend) break; ro = rstart + r->hseqbase; pat = BUNtvar(ri, rstart); rstart++; } /* nil regular expressions don't match (despite * nil_matches) */ if (GDK_STRNIL(pat)) continue; re = pcre_compile(pat, options, &err, &pos, NULL); sd = NULL; if (re == NULL) goto bailout; sd = pcre_study(re, 0, &err); if (err != NULL) goto bailout; /* inner loop: try to be more efficient than outer * loop, so test for lcand once instead of each * iteration */ if (lcand) { for (const oid *cand = lcand; cand < lcandend; cand++) { const char *val = BUNtvar(li, *cand - l->hseqbase); if (GDK_STRNIL(val)) continue; pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); if (pos >= 0) { /* regular expression matched */ if (BUNappend(bn1, cand, FALSE) != GDK_SUCCEED || BUNappend(bn2, &ro, FALSE) != GDK_SUCCEED) goto bailout; } else if (pos != PCRE_ERROR_NOMATCH) { /* error during processing */ err = "matching of regular expression failed"; goto bailout; } } } else { for (BUN start = lstart; start < lend; start++) { const char *val = BUNtvar(li, start); if (GDK_STRNIL(val)) continue; pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0); if (pos >= 0) { /* regular expression matched */ oid lo = start + l->hseqbase; if (BUNappend(bn1, &lo, FALSE) != GDK_SUCCEED || BUNappend(bn2, &ro, FALSE) != GDK_SUCCEED) goto bailout; } else if (pos != PCRE_ERROR_NOMATCH) { /* error during processing */ err = "matching of regular expression failed"; goto bailout; } } } pcre_free_study(sd); pcre_free(re); } BBPunfix(l->batCacheid); BBPunfix(r->batCacheid); if (sl) BBPunfix(sl->batCacheid); if (sr) BBPunfix(sr->batCacheid); *lres = bn1->batCacheid; *rres = bn2->batCacheid; BBPkeepref(*lres); BBPkeepref(*rres); return MAL_SUCCEED; bailout: BBPreclaim(bn1); BBPreclaim(bn2); BBPunfix(l->batCacheid); BBPunfix(r->batCacheid); if (sl) BBPunfix(sl->batCacheid); if (sr) BBPunfix(sr->batCacheid); if (sd) pcre_free_study(sd); if (re) pcre_free(re); if (err) throw(MAL, "pcre.rematchjoin", "error with regular expression: %s", err); throw(MAL, "pcre.rematchjoin", GDK_EXCEPTION); } char * regexpmatchjoin(bat *lres, bat *rres, const bat *lid, const bat *rid, const bat *sl, const bat *sr, const bit *nil_matches, const lng *estimate) { return do_join(lres, rres, *lid, *rid, "", *sl, *sr, *nil_matches, *estimate); } char * regexpmatchfjoin(bat *lres, bat *rres, const bat *lid, const bat *rid, const char **flags, const bat *sl, const bat *sr, const bit *nil_matches, const lng *estimate) { return do_join(lres, rres, *lid, *rid, *flags, *sl, *sr, *nil_matches, *estimate); }