Re: MonetDB: rdf - Generate sample data for all tables
Hi Duc,
I did not read your function in detail, but you can also use
BATsample_ (the void-headed version of BATsample).
lefteris
On Wed, Feb 26, 2014 at 2:10 PM, Minh-Duc Pham
Changeset: e7109fc24610 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=e7109fc24610 Modified Files: monetdb5/extras/rdf/rdfschema.c monetdb5/extras/rdf/rdfschema.h Branch: rdf Log Message:
Generate sample data for all tables
diffs (truncated from 721 to 300 lines):
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -695,6 +695,11 @@ char isInfrequentSampleProp(CS freqCS, i if (freqCS.lstPropSupport[propIdx] * 100 < freqCS.support * SAMPLE_FILTER_THRESHOLD) return 1; else return 0; } +static +char isInfrequentSampleCol(CS freqCS, PropTypes pt){ + if (pt.propFreq * 100 < freqCS.support * SAMPLE_FILTER_THRESHOLD) return 1; + else return 0; +}
static void genCSPropTypesColIdx(CSPropTypes* csPropTypes, int numMergedCS, CSset* freqCSset){ @@ -4830,6 +4835,211 @@ void initSampleData(CSSample *csSample,B
} } + + +static +void getSubjIdFromTablePosition(int tblIdx, int pos, oid *sOid){ + oid id; + id = pos; + id |= (BUN)(tblIdx + 1) << (sizeof(BUN)*8 - NBITS_FOR_CSID); + *sOid = id; +} + +static +str getOrigSbt(oid *sbt, oid *origSbt, BAT *lmap, BAT *rmap){ + BUN pos; + oid *tmp; + pos = BUNfnd(BATmirror(rmap),sbt); + if (pos == BUN_NONE){ + throw(RDF, "rdf.RDFdistTriplesToCSs", "This encoded subject must be in rmap"); + } + tmp = (oid *) Tloc(lmap, pos); + if (*tmp == BUN_NONE){ + throw(RDF, "rdf.RDFdistTriplesToCSs", "The encoded subject must be in lmap"); + } + + *origSbt = *tmp; + + return MAL_SUCCEED; +} + +static +str getOrigObt(oid *obt, oid *origObt, BAT *lmap, BAT *rmap){ + BUN pos; + oid *tmp; + oid tmporigOid = BUN_NONE; + char objType; + BUN maxObjectURIOid = ((oid)1 << (sizeof(BUN)*8 - NBITS_FOR_CSID - 1)) - 1; //Base on getTblIdxFromS + + objType = getObjType(*obt); + + if (objType == URI || objType == BLANKNODE){ + tmporigOid = (*obt) - ((oid)objType << (sizeof(BUN)*8 - 4)); + } + + if (tmporigOid > maxObjectURIOid){ + pos = BUNfnd(BATmirror(rmap),&tmporigOid); + if (pos == BUN_NONE){ + throw(RDF, "rdf.RDFdistTriplesToCSs", "This encoded object must be in rmap"); + } + tmp = (oid *) Tloc(lmap, pos); + if (*tmp == BUN_NONE){ + throw(RDF, "rdf.RDFdistTriplesToCSs", "The encoded object must be in lmap"); + } + + *origObt = *tmp; + } + else{ + *origObt = tmporigOid; + } + + return MAL_SUCCEED; +} + +static +str initFullSampleData(CSSampleExtend *csSampleEx, int *mTblIdxFreqIdxMapping, CSlabel *label, CStableStat* cstablestat, CSPropTypes *csPropTypes, CSset *freqCSset, int numTables, bat *lmapbatid, bat *rmapbatid){ + int i, j, k; + int freqId; + int tmpNumcand; + oid tmpCandidate; + int randValue = 0; + int ranPosition = 0; //random position of the instance in a table + int tmpNumCols; + int colIdx; + BAT *tmpbat = NULL; + BATiter tmpi; + BAT *cursamplebat = NULL; + int tmpNumRows = 0; + oid tmpSoid = BUN_NONE, origSoid = BUN_NONE; + oid origOid = 
BUN_NONE; + BAT *lmap = NULL, *rmap = NULL; + + if ((lmap = BATdescriptor(*lmapbatid)) == NULL) { + throw(MAL, "rdf.RDFdistTriplesToCSs", RUNTIME_OBJECT_MISSING); + } + + if ((rmap = BATdescriptor(*rmapbatid)) == NULL) { + BBPreleaseref(lmap->batCacheid); + throw(MAL, "rdf.RDFdistTriplesToCSs", RUNTIME_OBJECT_MISSING); + } + srand(123456); + for (i = 0; i < numTables; i++){ + freqId = mTblIdxFreqIdxMapping[i]; + csSampleEx[i].freqIdx = freqId; + tmpNumcand = (NUM_SAMPLE_CANDIDATE > label[freqId].candidatesCount)?label[freqId].candidatesCount:NUM_SAMPLE_CANDIDATE; + csSampleEx[i].name = cstablestat->lstcstable[i].tblname; + csSampleEx[i].candidateCount = tmpNumcand; + csSampleEx[i].candidates = (oid*)malloc(sizeof(oid) * tmpNumcand); + for (k = 0; k < tmpNumcand; k++){ + csSampleEx[i].candidates[k] = label[freqId].candidates[k]; + } + //Randomly exchange the value, change the position k with a random pos + for (k = 0; k < tmpNumcand; k++){ + randValue = rand() % tmpNumcand; + tmpCandidate = csSampleEx[i].candidates[k]; + csSampleEx[i].candidates[k] = csSampleEx[i].candidates[randValue]; + csSampleEx[i].candidates[randValue] = tmpCandidate; + } + + csSampleEx[i].lstSubjOid = (oid*)malloc(sizeof(oid) * NUM_SAMPLE_INSTANCE); + for (k = 0; k < NUM_SAMPLE_INSTANCE; k++) + csSampleEx[i].lstSubjOid[k] = BUN_NONE; + + tmpNumCols = csPropTypes[i].numProp - csPropTypes[i].numInfreqProp; //already remove infrequent column; + csSampleEx[i].numProp = tmpNumCols; + + assert(tmpNumCols > 0); + + csSampleEx[i].lstProp = (oid*)malloc(sizeof(oid) * tmpNumCols); + csSampleEx[i].lstIsInfrequentProp = (char*)malloc(sizeof(char) * tmpNumCols); + csSampleEx[i].lstIsMVCol = (char*)malloc(sizeof(char) * tmpNumCols); + csSampleEx[i].colBats = (BAT**)malloc(sizeof(BAT*) * tmpNumCols); + colIdx = -1; + csSampleEx[i].numInstances = 0; + for(j = 0; j < csPropTypes[i].numProp; j++){ + #if REMOVE_INFREQ_PROP + if (csPropTypes[i].lstPropTypes[j].defColIdx == -1) continue; //Infrequent prop + #endif 
+ colIdx++; + csSampleEx[i].lstProp[colIdx] = csPropTypes[i].lstPropTypes[j].prop; + + csSampleEx[i].colBats[colIdx] = BATnew(TYPE_void, cstablestat->lstcstable[i].colBats[colIdx]->ttype , NUM_SAMPLE_INSTANCE + 1); + + //Mark whether this col is infrequent sample cols + if ( isInfrequentSampleCol(freqCSset->items[freqId], csPropTypes[i].lstPropTypes[j])){ + csSampleEx[i].lstIsInfrequentProp[colIdx] = 1; + } + else + csSampleEx[i].lstIsInfrequentProp[colIdx] = 0; + + //Mark whther this col is a MV col + csSampleEx[i].lstIsMVCol[colIdx] = csPropTypes[i].lstPropTypes[j].isMVProp; + + //if this is a multivalue column, get the data type of the first column + + } + assert(colIdx == (tmpNumCols - 1)); + + + // Inserting instances to csSampleEx + + tmpNumRows = BATcount(cstablestat->lstcstable[i].colBats[0]); + + for (k = 0; k < NUM_SAMPLE_INSTANCE; k++){ + ranPosition = rand() % tmpNumRows; + + getSubjIdFromTablePosition(i, ranPosition, &tmpSoid); + + if (getOrigSbt(&tmpSoid, &origSoid, lmap, rmap) != MAL_SUCCEED){ + throw(RDF, "rdf.RDFdistTriplesToCSs","Problem in getting the orignal sbt "); + } + + csSampleEx[i].lstSubjOid[k] = origSoid; + + for (j = 0; j < tmpNumCols; j++){ + cursamplebat = csSampleEx[i].colBats[j]; + + tmpbat = cstablestat->lstcstable[i].colBats[j]; + tmpi = bat_iterator(tmpbat); + + if (tmpbat->ttype == TYPE_oid && csSampleEx[i].lstIsMVCol[j] == 0){ + //Get the original object oid + oid *tmpOid = (oid *) BUNtail(tmpi, ranPosition); + if(*tmpOid != oid_nil){ + if (getOrigObt(tmpOid, &origOid, lmap, rmap) != MAL_SUCCEED){ + throw(RDF, "rdf.RDFdistTriplesToCSs","Problem in getting the orignal obt "); + } + BUNappend(cursamplebat, &origOid, TRUE); + } + else{ + BUNappend(cursamplebat, ATOMnilptr(TYPE_oid), TRUE); + } + + } + else + BUNappend(cursamplebat, BUNtail(tmpi, ranPosition), TRUE); + + + + } + csSampleEx[i].numInstances++; + } + + if (i == 0) + for (j = 0; j < tmpNumCols; j++){ + //BATprint(cstablestat->lstcstable[i].colBats[j]); + 
BATprint(csSampleEx[i].colBats[j]); + } + + } + + BBPunfix(lmap->batCacheid); + BBPunfix(rmap->batCacheid); + + return MAL_SUCCEED; + +} + static void freeSampleData(CSSample *csSample, int numCand){ int i, j; @@ -4846,6 +5056,25 @@ void freeSampleData(CSSample *csSample, free(csSample); }
+ +static +void freeSampleExData(CSSampleExtend *csSampleEx, int numCand){ + int i, j; + for (i = 0; i < numCand; i++){ + free(csSampleEx[i].lstProp); + free(csSampleEx[i].lstIsInfrequentProp); + free(csSampleEx[i].lstIsMVCol); + free(csSampleEx[i].candidates); + free(csSampleEx[i].lstSubjOid); + for (j = 0; j < csSampleEx[i].numProp; j++){ + BBPunfix(csSampleEx[i].colBats[j]->batCacheid); + } + free(csSampleEx[i].colBats); + } + + free(csSampleEx); +} + static void addSampleInstance(oid subj, oid *buffO, oid* buffP, int numP, int sampleIdx, CSSample *csSample){ int i,j; @@ -5217,6 +5446,295 @@ str printSampleData(CSSample *csSample, return MAL_SUCCEED; }
+#if 0 +static +str printFullSampleData(CSSampleExtend *csSampleEx, CSset *freqCSset, BAT *mbat, int num, int sampleVersion){ + + int i,j, k; + FILE *fout, *fouttb, *foutis; + char filename[100], filename2[100], filename3[100]; + char tmpStr[20], tmpStr2[20], tmpStr3[20]; + int ret; + + str propStr; + str subjStr; + char* schema = "rdf"; + CSSample sample; + CS freqCS; + char objType = 0; + str objStr; + oid objOid = BUN_NONE; + BATiter mapi; + str canStr; + char isTitle = 0; + char isUrl = 0; + char isType = 0; + char isDescription = 0; + char isImage = 0; + char isSite = 0; + char isEmail = 0; + char isCountry = 0; + char isLocality = 0; + BAT *lmap = NULL, *rmap = NULL +#if USE_SHORT_NAMES + str propStrShort = NULL; + char *pch; +#endif + + + + mapi = bat_iterator(mbat); + + if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) { + throw(RDF, "rdf.rdfschema", + "could not open the tokenizer\n"); + } _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
Thanks Lefteris,
I should have looked at that before implementing my own function -_-.
Best,
Minh-Duc
----- Original Message -----
From: "Lefteris"
Changeset: e7109fc24610 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=e7109fc24610 Modified Files: monetdb5/extras/rdf/rdfschema.c monetdb5/extras/rdf/rdfschema.h Branch: rdf Log Message:
Generate sample data for all tables
diffs (truncated from 721 to 300 lines):
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -695,6 +695,11 @@ char isInfrequentSampleProp(CS freqCS, i if (freqCS.lstPropSupport[propIdx] * 100 < freqCS.support * SAMPLE_FILTER_THRESHOLD) return 1; else return 0; } +static +char isInfrequentSampleCol(CS freqCS, PropTypes pt){ + if (pt.propFreq * 100 < freqCS.support * SAMPLE_FILTER_THRESHOLD) return 1; + else return 0; +}
static void genCSPropTypesColIdx(CSPropTypes* csPropTypes, int numMergedCS, CSset* freqCSset){ @@ -4830,6 +4835,211 @@ void initSampleData(CSSample *csSample,B
} } + + +static +void getSubjIdFromTablePosition(int tblIdx, int pos, oid *sOid){ + oid id; + id = pos; + id |= (BUN)(tblIdx + 1) << (sizeof(BUN)*8 - NBITS_FOR_CSID); + *sOid = id; +} + +static +str getOrigSbt(oid *sbt, oid *origSbt, BAT *lmap, BAT *rmap){ + BUN pos; + oid *tmp; + pos = BUNfnd(BATmirror(rmap),sbt); + if (pos == BUN_NONE){ + throw(RDF, "rdf.RDFdistTriplesToCSs", "This encoded subject must be in rmap"); + } + tmp = (oid *) Tloc(lmap, pos); + if (*tmp == BUN_NONE){ + throw(RDF, "rdf.RDFdistTriplesToCSs", "The encoded subject must be in lmap"); + } + + *origSbt = *tmp; + + return MAL_SUCCEED; +} + +static +str getOrigObt(oid *obt, oid *origObt, BAT *lmap, BAT *rmap){ + BUN pos; + oid *tmp; + oid tmporigOid = BUN_NONE; + char objType; + BUN maxObjectURIOid = ((oid)1 << (sizeof(BUN)*8 - NBITS_FOR_CSID - 1)) - 1; //Base on getTblIdxFromS + + objType = getObjType(*obt); + + if (objType == URI || objType == BLANKNODE){ + tmporigOid = (*obt) - ((oid)objType << (sizeof(BUN)*8 - 4)); + } + + if (tmporigOid > maxObjectURIOid){ + pos = BUNfnd(BATmirror(rmap),&tmporigOid); + if (pos == BUN_NONE){ + throw(RDF, "rdf.RDFdistTriplesToCSs", "This encoded object must be in rmap"); + } + tmp = (oid *) Tloc(lmap, pos); + if (*tmp == BUN_NONE){ + throw(RDF, "rdf.RDFdistTriplesToCSs", "The encoded object must be in lmap"); + } + + *origObt = *tmp; + } + else{ + *origObt = tmporigOid; + } + + return MAL_SUCCEED; +} + +static +str initFullSampleData(CSSampleExtend *csSampleEx, int *mTblIdxFreqIdxMapping, CSlabel *label, CStableStat* cstablestat, CSPropTypes *csPropTypes, CSset *freqCSset, int numTables, bat *lmapbatid, bat *rmapbatid){ + int i, j, k; + int freqId; + int tmpNumcand; + oid tmpCandidate; + int randValue = 0; + int ranPosition = 0; //random position of the instance in a table + int tmpNumCols; + int colIdx; + BAT *tmpbat = NULL; + BATiter tmpi; + BAT *cursamplebat = NULL; + int tmpNumRows = 0; + oid tmpSoid = BUN_NONE, origSoid = BUN_NONE; + oid origOid = 
BUN_NONE; + BAT *lmap = NULL, *rmap = NULL; + + if ((lmap = BATdescriptor(*lmapbatid)) == NULL) { + throw(MAL, "rdf.RDFdistTriplesToCSs", RUNTIME_OBJECT_MISSING); + } + + if ((rmap = BATdescriptor(*rmapbatid)) == NULL) { + BBPreleaseref(lmap->batCacheid); + throw(MAL, "rdf.RDFdistTriplesToCSs", RUNTIME_OBJECT_MISSING); + } + srand(123456); + for (i = 0; i < numTables; i++){ + freqId = mTblIdxFreqIdxMapping[i]; + csSampleEx[i].freqIdx = freqId; + tmpNumcand = (NUM_SAMPLE_CANDIDATE > label[freqId].candidatesCount)?label[freqId].candidatesCount:NUM_SAMPLE_CANDIDATE; + csSampleEx[i].name = cstablestat->lstcstable[i].tblname; + csSampleEx[i].candidateCount = tmpNumcand; + csSampleEx[i].candidates = (oid*)malloc(sizeof(oid) * tmpNumcand); + for (k = 0; k < tmpNumcand; k++){ + csSampleEx[i].candidates[k] = label[freqId].candidates[k]; + } + //Randomly exchange the value, change the position k with a random pos + for (k = 0; k < tmpNumcand; k++){ + randValue = rand() % tmpNumcand; + tmpCandidate = csSampleEx[i].candidates[k]; + csSampleEx[i].candidates[k] = csSampleEx[i].candidates[randValue]; + csSampleEx[i].candidates[randValue] = tmpCandidate; + } + + csSampleEx[i].lstSubjOid = (oid*)malloc(sizeof(oid) * NUM_SAMPLE_INSTANCE); + for (k = 0; k < NUM_SAMPLE_INSTANCE; k++) + csSampleEx[i].lstSubjOid[k] = BUN_NONE; + + tmpNumCols = csPropTypes[i].numProp - csPropTypes[i].numInfreqProp; //already remove infrequent column; + csSampleEx[i].numProp = tmpNumCols; + + assert(tmpNumCols > 0); + + csSampleEx[i].lstProp = (oid*)malloc(sizeof(oid) * tmpNumCols); + csSampleEx[i].lstIsInfrequentProp = (char*)malloc(sizeof(char) * tmpNumCols); + csSampleEx[i].lstIsMVCol = (char*)malloc(sizeof(char) * tmpNumCols); + csSampleEx[i].colBats = (BAT**)malloc(sizeof(BAT*) * tmpNumCols); + colIdx = -1; + csSampleEx[i].numInstances = 0; + for(j = 0; j < csPropTypes[i].numProp; j++){ + #if REMOVE_INFREQ_PROP + if (csPropTypes[i].lstPropTypes[j].defColIdx == -1) continue; //Infrequent prop + #endif 
+ colIdx++; + csSampleEx[i].lstProp[colIdx] = csPropTypes[i].lstPropTypes[j].prop; + + csSampleEx[i].colBats[colIdx] = BATnew(TYPE_void, cstablestat->lstcstable[i].colBats[colIdx]->ttype , NUM_SAMPLE_INSTANCE + 1); + + //Mark whether this col is infrequent sample cols + if ( isInfrequentSampleCol(freqCSset->items[freqId], csPropTypes[i].lstPropTypes[j])){ + csSampleEx[i].lstIsInfrequentProp[colIdx] = 1; + } + else + csSampleEx[i].lstIsInfrequentProp[colIdx] = 0; + + //Mark whther this col is a MV col + csSampleEx[i].lstIsMVCol[colIdx] = csPropTypes[i].lstPropTypes[j].isMVProp; + + //if this is a multivalue column, get the data type of the first column + + } + assert(colIdx == (tmpNumCols - 1)); + + + // Inserting instances to csSampleEx + + tmpNumRows = BATcount(cstablestat->lstcstable[i].colBats[0]); + + for (k = 0; k < NUM_SAMPLE_INSTANCE; k++){ + ranPosition = rand() % tmpNumRows; + + getSubjIdFromTablePosition(i, ranPosition, &tmpSoid); + + if (getOrigSbt(&tmpSoid, &origSoid, lmap, rmap) != MAL_SUCCEED){ + throw(RDF, "rdf.RDFdistTriplesToCSs","Problem in getting the orignal sbt "); + } + + csSampleEx[i].lstSubjOid[k] = origSoid; + + for (j = 0; j < tmpNumCols; j++){ + cursamplebat = csSampleEx[i].colBats[j]; + + tmpbat = cstablestat->lstcstable[i].colBats[j]; + tmpi = bat_iterator(tmpbat); + + if (tmpbat->ttype == TYPE_oid && csSampleEx[i].lstIsMVCol[j] == 0){ + //Get the original object oid + oid *tmpOid = (oid *) BUNtail(tmpi, ranPosition); + if(*tmpOid != oid_nil){ + if (getOrigObt(tmpOid, &origOid, lmap, rmap) != MAL_SUCCEED){ + throw(RDF, "rdf.RDFdistTriplesToCSs","Problem in getting the orignal obt "); + } + BUNappend(cursamplebat, &origOid, TRUE); + } + else{ + BUNappend(cursamplebat, ATOMnilptr(TYPE_oid), TRUE); + } + + } + else + BUNappend(cursamplebat, BUNtail(tmpi, ranPosition), TRUE); + + + + } + csSampleEx[i].numInstances++; + } + + if (i == 0) + for (j = 0; j < tmpNumCols; j++){ + //BATprint(cstablestat->lstcstable[i].colBats[j]); + 
BATprint(csSampleEx[i].colBats[j]); + } + + } + + BBPunfix(lmap->batCacheid); + BBPunfix(rmap->batCacheid); + + return MAL_SUCCEED; + +} + static void freeSampleData(CSSample *csSample, int numCand){ int i, j; @@ -4846,6 +5056,25 @@ void freeSampleData(CSSample *csSample, free(csSample); }
+ +static +void freeSampleExData(CSSampleExtend *csSampleEx, int numCand){ + int i, j; + for (i = 0; i < numCand; i++){ + free(csSampleEx[i].lstProp); + free(csSampleEx[i].lstIsInfrequentProp); + free(csSampleEx[i].lstIsMVCol); + free(csSampleEx[i].candidates); + free(csSampleEx[i].lstSubjOid); + for (j = 0; j < csSampleEx[i].numProp; j++){ + BBPunfix(csSampleEx[i].colBats[j]->batCacheid); + } + free(csSampleEx[i].colBats); + } + + free(csSampleEx); +} + static void addSampleInstance(oid subj, oid *buffO, oid* buffP, int numP, int sampleIdx, CSSample *csSample){ int i,j; @@ -5217,6 +5446,295 @@ str printSampleData(CSSample *csSample, return MAL_SUCCEED; }
+#if 0 +static +str printFullSampleData(CSSampleExtend *csSampleEx, CSset *freqCSset, BAT *mbat, int num, int sampleVersion){ + + int i,j, k; + FILE *fout, *fouttb, *foutis; + char filename[100], filename2[100], filename3[100]; + char tmpStr[20], tmpStr2[20], tmpStr3[20]; + int ret; + + str propStr; + str subjStr; + char* schema = "rdf"; + CSSample sample; + CS freqCS; + char objType = 0; + str objStr; + oid objOid = BUN_NONE; + BATiter mapi; + str canStr; + char isTitle = 0; + char isUrl = 0; + char isType = 0; + char isDescription = 0; + char isImage = 0; + char isSite = 0; + char isEmail = 0; + char isCountry = 0; + char isLocality = 0; + BAT *lmap = NULL, *rmap = NULL +#if USE_SHORT_NAMES + str propStrShort = NULL; + char *pch; +#endif + + + + mapi = bat_iterator(mbat); + + if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) { + throw(RDF, "rdf.rdfschema", + "could not open the tokenizer\n"); + } _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
developers-list mailing list developers-list@monetdb.org https://www.monetdb.org/mailman/listinfo/developers-list
participants (2)
-
Lefteris
-
Pham Duc