Re: MonetDB: default - Move tokenizer to the attic
Martin, et al., please be aware that this also means that monetdb5/extras/rdf (a.k.a. --enable-rdf) no longer compiles, as it depends on the (now removed) tokenizer. Best, Stefan ----- Original Message -----
Changeset: 155c3a3fcfdb for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=155c3a3fcfdb Removed Files: monetdb5/modules/mal/tokenizer.c monetdb5/modules/mal/tokenizer.h monetdb5/modules/mal/tokenizer.mal Modified Files: monetdb5/modules/mal/Makefile.ag Branch: default Log Message:
Move tokenizer to the attic. The experiment to organise URLs using a variation of graph-based tokenization requires a major overhaul to support the void-headed approach.
diffs (truncated from 737 to 300 lines):
diff --git a/monetdb5/modules/mal/Makefile.ag b/monetdb5/modules/mal/Makefile.ag --- a/monetdb5/modules/mal/Makefile.ag +++ b/monetdb5/modules/mal/Makefile.ag @@ -54,7 +54,6 @@ lib_mal = { sabaoth.c sabaoth.h \ sysmon.c sysmon.h \ tablet.c tablet.h \ - tokenizer.c tokenizer.h \ trader.c trader.h \ transaction.c \ txtsim.c txtsim.h \ @@ -76,7 +75,7 @@ headers_mal = { mal_mapi.mal sabaoth.mal remote.mal \ txtsim.mal recycle.mal \ cluster.mal trader.mal \ - tokenizer.mal zorder.mal sample.mal json_util.mal \ + zorder.mal sample.mal json_util.mal \ calc.mal batcalc.mal batmtime.mal querylog.mal sysmon.mal }
@@ -84,7 +83,7 @@ EXTRA_DIST = batExtensions.mal iterator. groupby.mal mal_init.mal manual.mal mkey.mal manifold.mal pcre.mal \ profiler.mal recycle.mal remote.mal sabaoth.mal trader.mal \ transaction.mal txtsim.mal tablet.mal tablet.h sample.mal json_util.mal \ - mal_mapi.mal mat.mal tokenizer.mal pqueue.mal calc.mal \ + mal_mapi.mal mat.mal pqueue.mal calc.mal \ batcalc.mal batmtime.mal querylog.mal sysmon.mal
EXTRA_DIST_DIR = Tests diff --git a/monetdb5/modules/mal/tokenizer.c b/monetdb5/modules/mal/tokenizer.c deleted file mode 100644 --- a/monetdb5/modules/mal/tokenizer.c +++ /dev/null @@ -1,585 +0,0 @@ -/* - * The contents of this file are subject to the MonetDB Public License - * Version 1.1 (the "License"); you may not use this file except in - * compliance with the License. You may obtain a copy of the License at - * http://www.monetdb.org/Legal/MonetDBLicense - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the - * License for the specific language governing rights and limitations - * under the License. - * - * The Original Code is the MonetDB Database System. - * - * The Initial Developer of the Original Code is CWI. - * Portions created by CWI are Copyright (C) 1997-July 2008 CWI. - * Copyright August 2008-2014 MonetDB B.V. - * All Rights Reserved. - */ - -/* - * author Lefteris Sidirourgos - * Tokenizer - * This module implements a vertical fragmented tokenizer for strings. - * It is based on the ideas of the urlbox module by mk. - * - * The input string is tokenized according to a separator character. - * Each token is inserted to the next BAT with the same order of - * appearance in the string. We currently support 255 tokens in each - * string as this module is intended for use with short and similar - * strings such as URLs. In addition we maintain a 2-dimensional index - * that points to the depth and height of the last token of each string. - * The 2-dimensional index is combined to one BAT where the 8 least - * significant bits represent the depth, and the rest bits the height. - * - * The tokenizer can be accessed in two ways. Given the oid retrieve the - * re-constructed string, or given a string return its oid if present, - * otherwise nil. 
- * - * Strings can be added either in batch (from a file or a bat of - * strings) and by appending a single string. Duplicate elimination is - * always performed. - * - * There can be only one tokenizer open at the same time. This is - * achieved by setting a TRANSaction bat. This might change in the - * future. However there can be more than one tokenizers stored in the - * disk, each of which is identified by its name (usually the name of - * the active schema of the db). These administrative issues and - * security aspects (e.g., opening a tokenizer of a different schema) - * should be addressed more thoroughly. - */ -#include "monetdb_config.h" -#include "bat5.h" -#include "tokenizer.h" -#include "mal_linker.h" - -#define MAX_TKNZR_DEPTH 256 -#define INDEX MAX_TKNZR_DEPTH -static int tokenDepth = 0; -static BAT *tokenBAT[MAX_TKNZR_DEPTH + 1]; -static BAT *TRANS = NULL; /* the catalog of tokenizers */ -static char name[128]; - -#if SIZEOF_OID == 4 /* 32-bit oid */ -#define MAX_h ((((oid) 1) << 23) - 1) -#else /* 64-bit oid */ -#define MAX_h ((((oid) 1) << 55) - 1) -#endif - -#define COMP(h, d) ((h << 8) | (d & 255)) -#define GET_d(x) ((sht) ((x) & 255)) -#define GET_h(x) ((x) >> 8) - -static int prvlocate(BAT* b, oid *prv, str part) -{ - BAT *m = BATmirror(b); - BATiter mi = bat_iterator(m); - BUN p; - if (m->H->hash == NULL) - BAThash(m, 2 * BATcount(m)); - HASHloop_str(mi, m->H->hash, p, part) - { - if (*((oid *) BUNtail(mi, p)) == *prv) { - *prv = (oid) p; - return TRUE; - } - } - return FALSE; -} - -str -TKNZRopen(int *ret, str *in) -{ - int depth, r; - bat idx; - str batname = NULL; - BAT *b; - - (void) ret; - if (strlen(*in) > 127) - throw(MAL, "tokenizer.open", - ILLEGAL_ARGUMENT " tokenizer name too long"); - - MT_lock_set(&mal_contextLock, "tokenizer"); - if (TRANS != NULL) { - MT_lock_unset(&mal_contextLock, "tokenizer"); - throw(MAL, "tokenizer.open", "Another tokenizer is already open"); - } - - for (depth = 0; depth < MAX_TKNZR_DEPTH; depth++) { - 
tokenBAT[depth] = 0; - } - tokenDepth = 0; - - TRANS = BATnew(TYPE_void, TYPE_str, MAX_TKNZR_DEPTH + 1); - if (TRANS == NULL) { - MT_lock_unset(&mal_contextLock, "tokenizer"); - throw(MAL, "tokenizer.open", MAL_MALLOC_FAIL); - } - /* now we are sure that none overwrites the tokenizer table*/ - MT_lock_unset(&mal_contextLock, "tokenizer"); - BATseqbase(TRANS, 0); - - snprintf(name, 128, "%s", *in); - batname = (str) GDKmalloc(134 * sizeof(char)); - if( batname == NULL) - throw(MAL, "tokenizer.open", MAL_MALLOC_FAIL); - - snprintf(batname, 134, "%s_index", name); - idx = BBPindex(batname); - - if (idx == 0) { /* new tokenizer */ - b = BATnew(TYPE_void, TYPE_oid, 1024); - if (b == NULL) - throw(MAL, "tokenizer.open", MAL_MALLOC_FAIL); - BATkey(b, FALSE); - BATseqbase(b, 0); - tokenBAT[INDEX] = b; - if (BKCsetName(&r, (int *) &(b->batCacheid), (str *) &batname) != MAL_SUCCEED) - throw(MAL, "tokenizer.open", OPERATION_FAILED); - if (BKCsetPersistent(&r, (int *) &(b->batCacheid)) != MAL_SUCCEED) - throw(MAL, "tokenizer.open", OPERATION_FAILED); - BUNappend(TRANS, batname, FALSE); - } else { /* existing tokenizer */ - tokenBAT[INDEX] = BATdescriptor(idx); - BUNappend(TRANS, batname, FALSE); - - for (depth = 0; depth < MAX_TKNZR_DEPTH; depth++) { - snprintf(batname, 128, "%s_%d", name, depth); - idx = BBPindex(batname); - if (idx == 0) - break; - tokenBAT[depth] = BATdescriptor(idx); - BUNappend(TRANS, batname, FALSE); - } - tokenDepth = depth; - } - - GDKfree(batname); - return MAL_SUCCEED; -} - -str -TKNZRclose(int *r) -{ - int i; - (void) r; - - if (TRANS == NULL) - throw(MAL, "tokenizer", "no tokenizer store open"); - - TMsubcommit(TRANS); - - for (i = 0; i < tokenDepth; i++) { - BBPunfix(tokenBAT[i]->batCacheid); - } - BBPunfix(tokenBAT[INDEX]->batCacheid); - - tokenDepth = 0; - - BBPreclaim(TRANS); - TRANS = NULL; - return MAL_SUCCEED; -} - -/* - * Tokenize operations - * The tokenizer operation assumes a private copy to mark the end of the - * token separators with 
a zero byte. Tokens are separated by a single - * character for simplicity. Might be a good scheme to assume that - * strings to be broken are properly ended with either 0 or nl, not - * both. It seems 0 can be assumed. - */ -static int -TKNZRtokenize(str in, str *parts, char tkn) -{ - char *s, *t; - int depth = 0; - - s = in; - while (*s && *s != '\n') { - t = s; - while (*t != tkn && *t != '\n' && *t) - t++; - parts[depth++] = s; - s = t + (*t != 0); - *t = 0; - if (depth > MAX_TKNZR_DEPTH) - break; - } - return depth; -} - -str -TKNZRappend(oid *pos, str *s) -{ - str url; - str batname; - str parts[MAX_TKNZR_DEPTH]; - int i, new, r, depth; - BAT *b; - BUN p; - BUN idx = 0; - oid prv = 0; - oid comp; - - if (TRANS == NULL) - throw(MAL, "tokenizer", "no tokenizer store open"); - - if ((url = GDKstrdup(*s)) == NULL) { - throw(MAL, "tokenizer.append", OPERATION_FAILED MAL_MALLOC_FAIL); - } - - depth = TKNZRtokenize(url, parts, '/'); - new = depth; - - if (depth == 0) { - GDKfree(url); - return MAL_SUCCEED; - } - if (depth > MAX_TKNZR_DEPTH) { - GDKfree(url); - throw(MAL, "tokenizer", - ILLEGAL_ARGUMENT "input string breaks to too many parts"); - } - if (depth > tokenDepth || tokenBAT[0] == NULL) { - new = tokenDepth; - for (i = tokenDepth; i < depth; i++) { - /* make new bat */ - batname = (str) GDKmalloc(128 * sizeof(char)); - snprintf(batname, 128, "%s_%d", name, i); - b = BATnew(TYPE_oid, TYPE_str, 1024); - if (b == NULL) { - GDKfree(batname); - GDKfree(url); - throw(MAL, "tokenizer.append", MAL_MALLOC_FAIL); - } - BATkey(b, FALSE); - tokenBAT[i] = b; - - if (BKCsetName(&r, (int *) &(b->batCacheid), (str *) &batname) - != MAL_SUCCEED) { - GDKfree(batname); - GDKfree(url); - throw(MAL, "tokenizer.open", OPERATION_FAILED); - } - if (BKCsetPersistent(&r, (int *) &(b->batCacheid)) _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
-- | Stefan.Manegold@CWI.nl | DB Architectures (DA) | | www.CWI.nl/~manegold/ | Science Park 123 (L321) | | +31 (0)20 592-4212 | 1098 XG Amsterdam (NL) |
participants (1)
-
Stefan Manegold