view regexp/regexp.c @ 24:d67f80012f70

typo
author Jennie Zhang <y.zhang@cwi.nl>
date Wed, 24 Jan 2018 13:06:44 +0100 (2018-01-24)
parents 499debebda5d
children f0739f6c1a43
line wrap: on
line source
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0.  If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Copyright 2013-2018 MonetDB B.V.
 */

/* monetdb_config.h must be included as the first include file */
#include <monetdb_config.h>

/* mal_exception.h actually contains everything we need */
#include <mal_exception.h>

/* for the CANDINIT macro */
#include <gdk_cand.h> /* pre Jul2017-SP5, replace with CANDINIT definition */

/* system include files */
#include <string.h>

/* we use the PCRE library to do regular expression matching */
#include <pcre.h>

/* __declspec() must be used on Windows, but not on other systems */
#ifndef _MSC_VER
/* not Windows */
#define __declspec(x)	/* nothing */
#endif

/* these six functions are the only externally visible functions; on
 * Windows they must be exported, on other systems, declaring them as
 * extern is enough */
extern __declspec(dllexport) char *regexpmatch(bit *ret, const char **val, const char **pat);
extern __declspec(dllexport) char *regexpmatchf(bit *ret, const char **val, const char **pat, const char **flags);
extern __declspec(dllexport) char *regexpmatchselect(bat *ret, const bat *bid, const bat *sid, const char **pat, const bit *anti);
extern __declspec(dllexport) char *regexpmatchfselect(bat *ret, const bat *bid, const bat *sid, const char **pat, const char **flags, const bit *anti);
extern __declspec(dllexport) char *regexpmatchjoin(bat *lres, bat *rres, const bat *lid, const bat *rid, const bat *sl, const bat *sr, const bit *nil_matches, const lng *estimate);
extern __declspec(dllexport) char *regexpmatchfjoin(bat *lres, bat *rres, const bat *lid, const bat *rid, const char **flags, const bat *sl, const bat *sr, const bit *nil_matches, const lng *estimate);

static int
parseflags(const char *flags)
{
	int options = PCRE_UTF8; /* MonetDB uses UTF-8 exclusively */

	if (flags) {
		while (*flags) {
			switch (*flags) {
			case 'i':	/* case insensitive */
				options |= PCRE_CASELESS;
				break;
			case 'x':	/* extended regular expressions */
				options |= PCRE_EXTENDED;
				break;
			default:
				return -1; /* indicate there was an error */
			}
			flags++;
		}
	}
	return options;
}

static char *
do_match(bit *ret, const char *val, const char *pat, const char *flags)
{
	const char *err = NULL;
	int options;
	int pos = 0;
	pcre *re;

	/* if any of the input values are nil, the result is no match */
	if (GDK_STRNIL(val) ||
	    GDK_STRNIL(pat) ||
	    GDK_STRNIL(flags)) {
		/* special case for NIL inputs: NILs don't match anything */
		*ret = 0;
		return MAL_SUCCEED;
	}

	options = parseflags(flags);
	if (options == -1)
		throw(MAL, "regexp.rematch", "bad flag character");

	re = pcre_compile(pat, options, &err, &pos, NULL);
	if (re == NULL) {
		throw(MAL, "regexp.rematch",
		      "compilation of regular expression (%s) failed at %d with %s",
		      pat, pos, err);
	}
	pos = pcre_exec(re, NULL, val, (int) strlen(val), 0, 0, NULL, 0);
	pcre_free(re);
	if (pos < PCRE_ERROR_NOMATCH) {
		throw(MAL, "regexp.rematch",
		      "matching of regular expression (%s) failed with %d",
		      pat, pos);
	}
	*ret = pos >= 0;
	return MAL_SUCCEED;
}

char *
regexpmatch(bit *ret, const char **val, const char **pat)
{
	return do_match(ret, *val, *pat, "");
}

char *
regexpmatchf(bit *ret, const char **val, const char **pat, const char **flags)
{
	return do_match(ret, *val, *pat, *flags);
}

static char *
do_select(bat *ret, bat bid, bat sid, const char *pat, const char *flags, bit anti)
{
	BAT *b, *s = NULL;	/* input BAT and optional candidate list */

	BUN start, end;		/* start and end index */
	BUN cnt;		/* difference between start and end (unused) */
	const oid *cand, *candend; /* start and end of candidate list */
	BATiter bi;		/* helper to loop through values */

	BAT *bn;		/* result BAT */
	oid *outp;			/* pointer through which we add to result */

	const char *err = NULL;	/* error message from PCRE library */
	int pos = 0;		/* error position from PCRE library */
	int options;		/* PCRE options */
	pcre *re;		/* compiled regular expression */
	pcre_extra *sd;		/* studied regular expression */

	/* if any of the input values are nil, the result is no match */
	if (GDK_STRNIL(pat) || GDK_STRNIL(flags)) {
		/* no matches when the pattern or the flags is NIL
		 * we return an empty BAT of the correct type */
		if ((bn = BATdense(0, 0, 0)) == NULL)
			throw(MAL, "regexp.rematchselect", GDK_EXCEPTION);
		*ret = bn->batCacheid;
		BBPkeepref(*ret);
		return MAL_SUCCEED;
	}
	options = parseflags(flags);
	if (options == -1)
		throw(MAL, "regexp.rematchselect", "bad flag character");

	/* from the BAT IDs we need to get the BAT descriptors, making
	 * sure that the data of the BATs are loaded into memory */
	if ((b = BATdescriptor(bid)) == NULL) {
		throw(MAL, "regexp.rematchselect", RUNTIME_OBJECT_MISSING);
	}
	if (sid != 0 && sid != bat_nil &&
	    (s = BATdescriptor(sid)) == NULL) {
		BBPunfix(b->batCacheid);
		throw(MAL, "regexp.rematchselect", RUNTIME_OBJECT_MISSING);
	}

	/* allocate a result BAT; the capacity we ask for is the
	 * maximum potential result size (i.e. the size of the
	 * candidate list if there is one, else the size of the input
	 * BAT b) */
	bn = COLnew(0, TYPE_oid, s ? BATcount(s) : BATcount(b), TRANSIENT);
	if (bn == NULL) {
		BBPunfix(b->batCacheid);
		if (s)
			BBPunfix(s->batCacheid);
		throw(MAL, "regexp.rematchselect", GDK_EXCEPTION);
	}

	/* Position outp at the start of the result array.
	 * We know the array is large enough even if every value were
	 * to match, so we don't need to check for that. */
	outp = (oid *) Tloc(bn, 0);

	/* compile the regular expression */
	re = pcre_compile(pat, options, &err, &pos, NULL);
	if (re == NULL) {
		BBPunfix(b->batCacheid);
		if (s)
			BBPunfix(s->batCacheid);
		BBPreclaim(bn);
		throw(MAL, "regexp.rematchselect",
		      "compilation of regular expression (%s) failed at %d with %s",
		      pat, pos, err);
	}
	/* invest in study of the r.e. */
	sd = pcre_study(re, 0, &err);
	if (err != NULL) {
		pcre_free(re);
		BBPunfix(b->batCacheid);
		if (s)
			BBPunfix(s->batCacheid);
		BBPreclaim(bn);
		throw(MAL, "regexp.rematchselect",
		      "study of regular expression (%s) failed with %s",
		      pat, err);
	}

	/* initialize the start, end, cnt, cand, and candend variables
	 * from b and s*/
	CANDINIT(b, s, start, end, cnt, cand, candend);

	/* now, start and end are the limits in b that we need to look
	 * at, and if set, cand and candend are the beginning and end
	 * of the list of OIDs of b that we need to consider */

	bi = bat_iterator(b);

	if (cand) {
		/* there is a non-dense candidate list, we run through
		 * it and only check values in b that are referenced
		 * in this candidate list */
		while (cand < candend) {
			/* the candidate list has a list of OIDs which
			 * are relative to b->hseqbase, we need to
			 * convert that to an index relative to the
			 * start of the array in the (tail) heap */
			const char *val = BUNtvar(bi, *cand - b->hseqbase);
			/* nil values never match */
			if (!GDK_STRNIL(val)) {
				pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0);
				if (pos >= 0) {
					/* regular expression matched */
					if (!anti)
						*outp++ = *cand;
				} else if (pos == PCRE_ERROR_NOMATCH) {
					/* regular expression didn't match */
					if (anti)
						*outp++ = *cand;
				} else {
					/* error during processing */
					BBPunfix(b->batCacheid);
					BBPunfix(s->batCacheid);
					BBPreclaim(bn);
					pcre_free_study(sd);
					pcre_free(re);
					throw(MAL, "regexp.rematchselect",
					      "matching of regular expression (%s) failed with %d",
					      pat, pos);
				}
			}
			cand++;
		}
		/* we're done with s */
		BBPunfix(s->batCacheid);
	} else {
		/* we're done with s */
		if (s) {
			/* there is a dense candidate list, we don't
			 * need it anymore since we extracted start
			 * and end */
			BBPunfix(s->batCacheid);
		}
		while (start < end) {
			const char *val = BUNtvar(bi, start);
			/* nil values never match */
			if (!GDK_STRNIL(val)) {
				pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0);
				/* note that we need to convert the
				 * index relative to the start of the
				 * array to a value relative to
				 * b->hseqbase (opposite from what
				 * happens above) */
				if (pos >= 0) {
					/* regular expression matched */
					if (!anti)
						*outp++ = b->hseqbase + start;
				} else if (pos == PCRE_ERROR_NOMATCH) {
					/* regular expression didn't match */
					if (anti)
						*outp++ = b->hseqbase + start;
				} else {
					/* error during processing */
					BBPunfix(b->batCacheid);
					BBPreclaim(bn);
					pcre_free_study(sd);
					pcre_free(re);
					throw(MAL, "regexp.rematchselect",
					      "matching of regular expression (%s) failed with %d",
					      pat, pos);
				}
			}
			start++;
		}
	}

	/* we're done with b and re */
	BBPunfix(b->batCacheid);
	pcre_free_study(sd);
	pcre_free(re);

	/* set properties and size of result BAT */
	BATsetcount(bn, (BUN) (outp - (oid *) Tloc(bn, 0))); /* size is pointer difference */
	/* the result BAT of a select operation MUST be sorted, and
	 * all values MUST be distinct (i.e. it is a candidate list);
	 * due to the way we created the result, we know this is the
	 * case */
	bn->tsorted = 1;
	bn->tnosorted = 0;
	bn->tkey = 1;
	if (BATcount(bn) > 1) {
		/* if more than 1 result, it is not reverse sorted */
		bn->trevsorted = 0;
		bn->tnorevsorted = 1; /* index 1 is larger than index 0 */
		/* the BAT is dense if the type is TYPE_oid, the
		 * values are sorted in ascending order, they are all
		 * distinct, and they form a consecutive sequence (no
		 * missing values); we only need to check the last
		 * condition, which we do by checking the difference
		 * between the first and last values */
		outp = (oid *) Tloc(bn, 0); /* pointer to start */
		bn->tdense = outp[BATcount(bn) - 1] - outp[0] == BATcount(bn) - 1;
	} else {
		/* if empty or a single result, it is reverse sorted
		 * and dense */
		bn->trevsorted = 1;
		bn->tnorevsorted = 0;
		bn->tdense = 1;
	}
	if (bn->tdense) {
		/* if dense, we must also set tseqbase to the first value */
		if (BATcount(bn) == 0)
			bn->tseqbase = 0; /* no first value */
		else
			bn->tseqbase = * (oid *) Tloc(bn, 0);
	} else {
		bn->tseqbase = oid_nil;
	}
	/* there are no NIL values in the result */
	bn->tnil = 0;
	bn->tnonil = 1;

	*ret = bn->batCacheid;
	BBPkeepref(*ret);
	return MAL_SUCCEED;
}

char *
regexpmatchselect(bat *ret, const bat *bid, const bat *sid, const char **pat, const bit *anti)
{
	return do_select(ret, *bid, sid ? *sid : 0, *pat, "", *anti);
}


char *
regexpmatchfselect(bat *ret, const bat *bid, const bat *sid, const char **pat, const char **flags, const bit *anti)
{
	return do_select(ret, *bid, sid ? *sid : 0, *pat, *flags, *anti);
}

static char *
do_join(bat *lres, bat *rres, bat lid, bat rid, const char *flags,
	bat slid, bat srid, bit nil_matches, lng estimate)
{
	BAT *l, *r, *sl = NULL, *sr = NULL; /* input BATs */
	BAT *bn1, *bn2;		/* output BATs */

	BUN lstart, lend;	/* start and end index */
	BUN lcnt;		/* difference between start and end (unused) */
	const oid *lcand, *lcandend; /* start and end of candidate list */
	BATiter li;		/* helper to loop through values */

	BUN rstart, rend;	/* start and end index */
	BUN rcnt;		/* difference between start and end (unused) */
	const oid *rcand, *rcandend; /* start and end of candidate list */
	BATiter ri;		/* helper to loop through values */
	oid ro;			/* right OID being matched */

	const char *pat;	/* the regular expression being matched */
	const char *err;	/* error message from PCRE library */
	int pos = 0;		/* error position from PCRE library */
	int options;		/* PCRE options */
	pcre *re;		/* compiled regular expression */
	pcre_extra *sd;		/* studied regular expression */

	(void) nil_matches;	/* only relevant for equi-join */

	if (GDK_STRNIL(flags)) {
		/* no matches when the flags is NIL
		 * we return two empty BATs */
		bn1 = BATdense(0, 0, 0);
		bn2 = BATdense(0, 0, 0);
		if (bn1 == NULL || bn2 == NULL) {
			BBPreclaim(bn1);
			BBPreclaim(bn2);
			throw(MAL, "regexp.rematchjoin", GDK_EXCEPTION);
		}
		return MAL_SUCCEED;
	}
	options = parseflags(flags);
	if (options == -1)
		throw(MAL, "regexp.rematchjoin", "bad flag character");

	l = BATdescriptor(lid);
	r = BATdescriptor(rid);
	if (slid != 0 && slid != bat_nil)
		sl = BATdescriptor(slid);
	if (srid != 0 && srid != bat_nil)
		sr = BATdescriptor(srid);
	if (l == NULL || r == NULL ||
	    (slid != 0 && slid != bat_nil && sl == NULL) ||
	    (srid != 0 && srid != bat_nil && sr == NULL)) {
		/* one of the calls to BATdescriptor failed */
		if (l)
			BBPunfix(l->batCacheid);
		if (r)
			BBPunfix(r->batCacheid);
		if (sl)
			BBPunfix(sl->batCacheid);
		if (sr)
			BBPunfix(sr->batCacheid);
		throw(MAL, "regexp.rematchselect", RUNTIME_OBJECT_MISSING);
	}

	/* if there is no valid estimate, use the size of the left
	 * input as size estimate */
	if (estimate == lng_nil || estimate == 0)
		estimate = sl ? BATcount(sl) : BATcount(l);

	/* create the output BATs */
	bn1 = COLnew(0, TYPE_oid, estimate, TRANSIENT);
	bn2 = COLnew(0, TYPE_oid, estimate, TRANSIENT);
	if (bn1 == NULL || bn2 == NULL) {
		/* one of the calls to COLnew failed
		 * note, BBPreclaim check whether its argument is NULL */
		BBPreclaim(bn1);
		BBPreclaim(bn2);
		BBPunfix(l->batCacheid);
		BBPunfix(r->batCacheid);
		if (sl)
			BBPunfix(sl->batCacheid);
		if (sr)
			BBPunfix(sr->batCacheid);
		throw(MAL, "regexp.rematchselect", GDK_EXCEPTION);
	}

	/* extract boundaries and candidate lists from inputs */
	CANDINIT(l, sl, lstart, lend, lcnt, lcand, lcandend);
	CANDINIT(r, sr, rstart, rend, rcnt, rcand, rcandend);

	li = bat_iterator(l);
	ri = bat_iterator(r);

	/* if there is a candidate list (sl or sr is not NULL), but it
	 * is dense (lcand/rcand is NULL), we don't need the BAT
	 * descriptor anymore, the start and end values suffice */
	if (sl && lcand == NULL) {
		BBPunfix(sl->batCacheid);
		sl = NULL;
	}
	if (sr && rcand == NULL) {
		BBPunfix(sr->batCacheid);
		sr = NULL;
	}

	for (;;) {
		if (rcand) {
			if (rcand == rcandend)
				break;
			ro = *rcand++;
			pat = BUNtvar(ri, ro - r->hseqbase);
		} else {
			if (rstart == rend)
				break;
			ro = rstart + r->hseqbase;
			pat = BUNtvar(ri, rstart);
			rstart++;
		}
		/* nil regular expressions don't match (despite
		 * nil_matches) */
		if (GDK_STRNIL(pat))
			continue;
		re = pcre_compile(pat, options, &err, &pos, NULL);
		sd = NULL;
		if (re == NULL)
			goto bailout;
		sd = pcre_study(re, 0, &err);
		if (err != NULL)
			goto bailout;
		/* inner loop: try to be more efficient than outer
		 * loop, so test for lcand once instead of each
		 * iteration */
		if (lcand) {
			for (const oid *cand = lcand; cand < lcandend; cand++) {
				const char *val = BUNtvar(li, *cand - l->hseqbase);
				if (GDK_STRNIL(val))
					continue;
				pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0);
				if (pos >= 0) {
					/* regular expression matched */
					if (BUNappend(bn1, cand, FALSE) != GDK_SUCCEED ||
					    BUNappend(bn2, &ro, FALSE) != GDK_SUCCEED)
						goto bailout;
				} else if (pos != PCRE_ERROR_NOMATCH) {
					/* error during processing */
					err = "matching of regular expression failed";
					goto bailout;
				}
			}
		} else {
			for (BUN start = lstart; start < lend; start++) {
				const char *val = BUNtvar(li, start);
				if (GDK_STRNIL(val))
					continue;
				pos = pcre_exec(re, sd, val, (int) strlen(val), 0, 0, NULL, 0);
				if (pos >= 0) {
					/* regular expression matched */
					oid lo = start + l->hseqbase;
					if (BUNappend(bn1, &lo, FALSE) != GDK_SUCCEED ||
					    BUNappend(bn2, &ro, FALSE) != GDK_SUCCEED)
						goto bailout;
				} else if (pos != PCRE_ERROR_NOMATCH) {
					/* error during processing */
					err = "matching of regular expression failed";
					goto bailout;
				}
			}
		}
		pcre_free_study(sd);
		pcre_free(re);
	}

	BBPunfix(l->batCacheid);
	BBPunfix(r->batCacheid);
	if (sl)
		BBPunfix(sl->batCacheid);
	if (sr)
		BBPunfix(sr->batCacheid);
	*lres = bn1->batCacheid;
	*rres = bn2->batCacheid;
	BBPkeepref(*lres);
	BBPkeepref(*rres);
	return MAL_SUCCEED;

  bailout:
	BBPreclaim(bn1);
	BBPreclaim(bn2);
	BBPunfix(l->batCacheid);
	BBPunfix(r->batCacheid);
	if (sl)
		BBPunfix(sl->batCacheid);
	if (sr)
		BBPunfix(sr->batCacheid);
	if (sd)
		pcre_free_study(sd);
	if (re)
		pcre_free(re);
	if (err)
		throw(MAL, "pcre.rematchjoin",
		      "error with regular expression: %s", err);
	throw(MAL, "pcre.rematchjoin", GDK_EXCEPTION);
}

char *
regexpmatchjoin(bat *lres, bat *rres, const bat *lid, const bat *rid,
		const bat *sl, const bat *sr, const bit *nil_matches,
		const lng *estimate)
{
	return do_join(lres, rres, *lid, *rid, "", *sl, *sr,
		       *nil_matches, *estimate);
}

char *
regexpmatchfjoin(bat *lres, bat *rres, const bat *lid, const bat *rid,
		 const char **flags, const bat *sl, const bat *sr,
		 const bit *nil_matches, const lng *estimate)
{
	return do_join(lres, rres, *lid, *rid, *flags, *sl, *sr,
		       *nil_matches, *estimate);
}