On Fri, Feb 19, 2010 at 03:40:10PM +0000, Peter Boncz wrote:
Update of /cvsroot/monetdb/MonetDB/src/gdk In directory sfp-cvsdas-1.v30.ch3.sourceforge.com:/tmp/cvs-serv10221
Modified Files: Tag: Feb2010 gdk_posix.mx gdk_storage.mx Log Message: - fix newly introduced bug (BUF_SEQUENTIAL passed to posix_madvise instead of MMAP_SEQUENTIAL); most of this was done by Stefan, but the intent of the code was, when all users are gone, to give uniform (sequential) advice to the whole heap - make sure the length passed with the advice is page aligned (not strictly needed for Linux, but maybe for other Un*ces) - in case of shared vheaps (as done by leftfetchjoin into a string bat), assume that access to the string heap will be random.
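Shouldn't len & ~MT_pagesize() then rather be len & ~(MT_pagesize()-1) (assuming the page size is a power of 2)? As written, ~MT_pagesize() clears only the single bit that equals the page size, so the result is not rounded down to a multiple of the page size. Stefan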
Index: gdk_storage.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_storage.mx,v retrieving revision 1.149.2.36 retrieving revision 1.149.2.37 diff -u -d -r1.149.2.36 -r1.149.2.37 --- gdk_storage.mx 19 Feb 2010 13:13:38 -0000 1.149.2.36 +++ gdk_storage.mx 19 Feb 2010 15:40:07 -0000 1.149.2.37 @@ -707,17 +707,19 @@ * Peter Feb2010: I tried to do prefetches further apart, to trigger multiple readahead * units in parallel, but it does improve performance visibly */ -static size_t access_heap(str id, str hp, Heap *h, char* base, size_t sz, int touch, int preload, int advise) { +static size_t access_heap(str id, str hp, Heap *h, char* base, size_t sz, int touch, int preload, int adv) { size_t v0 = 0, v1 = 0, v2 = 0, v3 = 0, v4 = 0, v5 =0, v6 = 0, v7 = 0, page = MT_pagesize(); + str advise = (adv==MMAP_WILLNEED)?"WILLNEED":(adv==MMAP_SEQUENTIAL)?"SEQUENTIAL":(adv==MMAP_RANDOM)?"RANDOM":(adv==MMAP_NORMAL)?"NORMAL":NULL; int t = GDKms(); + assert(advise); if (h->storage != STORE_MEM && h->size > MT_MMAP_TILE) { - MT_mmap_inform(h->base, h->size, preload, advise, 0); + MT_mmap_inform(h->base, h->size, preload, adv, 0); if (preload > 0) { - void* alignedbase = (void*) (((size_t) base) & ~(page-1)); - size_t alignedsz = (sz + (page-1)) & ~(page-1); - int ret = posix_madvise(alignedbase, sz, advise); - if (ret) THRprintf(GDKerr, "#MT_mmap_inform: posix_madvise(file=%s, base="PTRFMT", len="SZFMT"MB, advice=%d) = %d\n", - h->filename, PTRFMTCAST alignedbase, alignedsz >> 20, advise, errno); + size_t alignskip = (page - (((size_t) base) & (page-1))) & (page-1); + size_t alignedsz = (size_t) (((sz < alignskip)?0:((size_t) (sz-alignskip))) & ~(page-1)); + int ret = posix_madvise(base + alignskip, alignedsz, adv); + if (ret) THRprintf(GDKerr, "#MT_mmap_inform: posix_madvise(file=%s, base="PTRFMT", len="SZFMT"MB, advice=%s) = %d\n", + h->filename, PTRFMTCAST (base + alignskip), alignedsz >> 20, advise, errno); } } 
if (touch && preload > 0) { @@ -731,8 +733,7 @@ } for (hi += 7*page; lo <= hi; lo +=page) v0 += *lo; } - IODEBUG THRprintf(GDKout,"#BATpreload(%s->%s,preload=%d,sz=%dMB,%s) = %dms \n", id, hp, preload, (int) (sz>>20), - (advise==MMAP_WILLNEED)?"WILLNEED":(advise==MMAP_SEQUENTIAL)?"SEQUENTIAL":"UNKNOWN", GDKms()-t); + IODEBUG THRprintf(GDKout,"#BATpreload(%s->%s,preload=%d,sz=%dMB,%s) = %dms \n", id, hp, preload, (int) (sz>>20), advise, GDKms()-t); return v0+v1+v2+v3+v4+v5+v6+v7; }
@@ -743,7 +744,6 @@ str id = BATgetId(b); BATiter bi = bat_iterator(b);
- assert(advise==MMAP_NORMAL||advise==MMAP_RANDOM||advise==MMAP_SEQUENTIAL||advise==MMAP_WILLNEED||advise==MMAP_DONTNEED); if (BATcount(b) == 0) return 0;
/* HASH indices (inherent random access). handle first as they *will* be access randomly (one can always hope for locality on the other heaps) */ @@ -760,30 +760,46 @@ gdk_unset_lock(GDKhashLock(ABS(b->batCacheid) & BBP_BATMASK), "BATaccess"); }
- /* we only touch stuff that is going to be read randomly (WILLNEED). Note varheaps are sequential wrt to the references, or small */ - if ( what&USE_HEAD) { + /* vheaps next, as shared vheaps are not seq-correlated needing WILLNEED (use prefetch budget for this first) */ + if ( what&USE_HEAD ) { + if (b->H->vheap && b->H->vheap->base) { + char *lo = BUNhead(bi, BUNfirst(b)), *hi = BUNhead(bi, BUNlast(b)-1); + int heap_advise = advise; + if (b->H->vheap->copied) { /* shared string heaps are not (likely) to be sequentially correlated */ + lo = b->H->vheap->base; hi = lo + b->H->vheap->free; + heap_advise = MADV_WILLNEED; + } + budget -= sz = ((hi-lo) > budget)?budget:(hi-lo); + v += access_heap(id, "hheap", b->H->vheap, lo, sz, (advise == BUF_WILLNEED), preload, heap_advise); + } + } + if ( what&USE_TAIL ) { + if (b->T->vheap && b->T->vheap->base) { + char *lo = BUNtail(bi, BUNfirst(b)), *hi = BUNtail(bi, BUNlast(b)-1); + int heap_advise = advise; + if (b->T->vheap->copied) { /* shared string heaps are not (likely) to be sequentially correlated */ + lo = b->T->vheap->base; hi = lo + b->T->vheap->free; + heap_advise = MADV_WILLNEED; + } + budget -= sz = ((hi-lo) > budget)?budget:(hi-lo); + v += access_heap(id, "theap", b->T->vheap, lo, sz, (advise == BUF_WILLNEED), preload, heap_advise); + } + } + + /* BUN heaps are last in line for prefetch budget */ + if ( what&USE_HEAD ) { if (b->H->heap.base) { char *lo = BUNhloc(bi, BUNfirst(b)), *hi = BUNhloc(bi, BUNlast(b)-1); budget -= sz = ((hi-lo) > budget)?budget:(hi-lo); v += access_heap(id, "hbuns", &b->H->heap, lo, sz, (advise == MMAP_WILLNEED), preload, advise); } - if (b->H->vheap && b->H->vheap->base) { - char *lo = BUNhead(bi, BUNfirst(b)), *hi = BUNhead(bi, BUNlast(b)-1); - budget -= sz = ((hi-lo) > budget)?budget:(hi-lo); - v += access_heap(id, "hheap", b->H->vheap, lo, sz, (advise == MMAP_WILLNEED), preload, advise); - } } - if ( what&USE_TAIL) { + if ( what&USE_TAIL ) { if (b->T->heap.base) { char *lo = 
BUNtloc(bi, BUNfirst(b)), *hi = BUNtloc(bi, BUNlast(b)-1); budget -= sz = ((hi-lo) > budget)?budget:(hi-lo); v += access_heap(id, "tbuns", &b->T->heap, lo, sz, (advise == MMAP_WILLNEED), preload, advise); } - if (b->T->vheap && b->T->vheap->base) { - char *lo = BUNtail(bi, BUNfirst(b)), *hi = BUNtail(bi, BUNlast(b)-1); - budget -= sz = ((hi-lo) > budget)?budget:(hi-lo); - v += access_heap(id, "theap", b->T->vheap, lo, sz, (advise == MMAP_WILLNEED), preload, advise); - } } return v; }
Index: gdk_posix.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_posix.mx,v retrieving revision 1.176.2.24 retrieving revision 1.176.2.25 diff -u -d -r1.176.2.24 -r1.176.2.25 --- gdk_posix.mx 19 Feb 2010 13:13:38 -0000 1.176.2.24 +++ gdk_posix.mx 19 Feb 2010 15:40:07 -0000 1.176.2.25 @@ -675,7 +675,7 @@ #ifdef HAVE_POSIX_FADVISE if (!do_not_use_posix_fadvise && MT_mmap_tab[victim].fd >= 0) { /* tell the OS quite clearly that you want to drop this */ - ret = posix_fadvise(MT_mmap_tab[victim].fd, 0LL, MT_mmap_tab[victim].len, POSIX_FADV_DONTNEED); + ret = posix_fadvise(MT_mmap_tab[victim].fd, 0LL, MT_mmap_tab[victim].len & ~MT_pagesize(), POSIX_FADV_DONTNEED); #ifdef MMAP_DEBUG stream_printf(GDKerr, "#MT_mmap_del: posix_fadvise(%s,fd=%d,%uMB,POSIX_FADV_DONTNEED) = %d\n", MT_mmap_tab[victim].path, MT_mmap_tab[victim].fd, (unsigned int) (MT_mmap_tab[victim].len >> 20), ret); #endif @@ -709,7 +709,7 @@ i = MT_mmap_idx(base, len); if (i >= 0) { if (MT_mmap_tab[i].fd >= 0) { - ret = posix_fadvise(MT_mmap_tab[i].fd, 0, len, advice); + ret = posix_fadvise(MT_mmap_tab[i].fd, 0, len & ~MT_pagesize(), advice); #ifdef MMAP_DEBUG stream_printf(GDKerr, "#MT_fadvise: posix_fadvise(%s,fd=%d,%uMB,%d) = %d\n", MT_mmap_tab[i].path, MT_mmap_tab[i].fd, (unsigned int) (len >> 20), advice, ret); #endif @@ -733,7 +733,7 @@ { size_t len = MIN((size_t) MT_MMAP_TILE, MT_mmap_tab[i].len - off); /* tell Linux to please stop caching this stuff */ - int ret = posix_madvise(MT_mmap_tab[i].base + off, len, POSIX_MADV_DONTNEED); + int ret = posix_madvise(MT_mmap_tab[i].base + off, len & ~MT_pagesize(), POSIX_MADV_DONTNEED);
if (err) { stream_printf(err, "#MT_mmap_unload_tile: posix_madvise(%s,off=%uMB,%uMB,fd=%d,POSIX_MADV_DONTNEED) = %d\n", @@ -743,7 +743,7 @@ #ifdef HAVE_POSIX_FADVISE if (!do_not_use_posix_fadvise) { /* tell the OS quite clearly that you want to drop this */ - ret = posix_fadvise(MT_mmap_tab[i].fd, off, len, POSIX_FADV_DONTNEED); + ret = posix_fadvise(MT_mmap_tab[i].fd, off, len & ~MT_pagesize(), POSIX_FADV_DONTNEED); if (err) { stream_printf(err, "#MT_mmap_unload_tile: posix_fadvise(%s,off=%uMB,%uMB,fd=%d,POSIX_MADV_DONTNEED) = %d\n", MT_mmap_tab[i].path, (unsigned int) (off >> 20), @@ -908,10 +908,9 @@ MT_mmap_tab[i].random += preload * (advise == MMAP_WILLNEED); /* done as a counter to keep track of multiple threads */ MT_mmap_tab[i].usecnt += preload; /* active thread count */ unload = MT_mmap_tab[i].usecnt == 0; + if (unload) ret = posix_madvise(MT_mmap_tab[i].base, MT_mmap_tab[i].len & ~MT_pagesize(), MMAP_SEQUENTIAL); } (void) pthread_mutex_unlock(&MT_mmap_lock); - if (unload) - ret = posix_madvise(base, len, MMAP_SEQUENTIAL); if (ret) { stream_printf(GDKerr, "#MT_mmap_inform: posix_madvise(file=%s, fd=%d, base="PTRFMT", len="SZFMT"MB, advice=%d) = %d\n", (i >= 0 ? MT_mmap_tab[i].path : ""), (i >= 0 ? MT_mmap_tab[i].fd : -1), @@ -1156,7 +1155,7 @@
if (ret != (void *) -1L) { if (hdl->mode & MMAP_ADVISE) { - (void) MT_madvise(ret, len, hdl->mode & MMAP_ADVISE); + (void) MT_madvise(ret, len & ~MT_pagesize(), hdl->mode & MMAP_ADVISE); } hdl->fixed = (void *) ((char *) ret + len); } @@ -1199,7 +1198,7 @@ int MT_madvise(void *p, size_t len, int advise) { - int ret = posix_madvise(p, len, advise); + int ret = posix_madvise(p, len & ~MT_pagesize(), advise);
#ifdef MMAP_DEBUG stream_printf(GDKerr, "#posix_madvise(" PTRFMT "," SZFMT ",%d) = %d\n", PTRFMTCAST p, len, advise, ret);
------------------------------------------------------------------------------ Download Intel® Parallel Studio Eval Try the new software tools for yourself. Speed compiling, find bugs proactively, and fine-tune applications for parallel performance. See why Intel Parallel Studio got high marks during beta. http://p.sf.net/sfu/intel-sw-dev _______________________________________________ Monetdb-checkins mailing list Monetdb-checkins@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/monetdb-checkins
-- | Dr. Stefan Manegold | mailto:Stefan.Manegold@cwi.nl | | CWI, P.O.Box 94079 | http://www.cwi.nl/~manegold/ | | 1090 GB Amsterdam | Tel.: +31 (20) 592-4212 | | The Netherlands | Fax : +31 (20) 592-4199 |