LCOV - code coverage report
Current view: top level - gdk - gdk_posix.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 47 100 47.0 %
Date: 2024-10-04 20:04:04 Functions: 7 9 77.8 %

          Line data    Source code
       1             : /*
       2             :  * SPDX-License-Identifier: MPL-2.0
       3             :  *
       4             :  * This Source Code Form is subject to the terms of the Mozilla Public
       5             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       6             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       7             :  *
       8             :  * Copyright 2024 MonetDB Foundation;
       9             :  * Copyright August 2008 - 2023 MonetDB B.V.;
      10             :  * Copyright 1997 - July 2008 CWI.
      11             :  */
      12             : 
      13             : /*
      14             :  * @a Niels Nes, Peter Boncz
      15             :  * @* System Independent Layer
      16             :  *
      17             :  * GDK is built on Posix. Exceptions are made for memory mapped files
      18             :  * and anonymous virtual memory, for which somewhat higher-level
      19             :  * functions are defined here.  Most of this file concerns itself with
      20             :  * emulation of Posix functionality on the WIN32 native platform.
      21             :  */
      22             : #include "monetdb_config.h"
      23             : #include "gdk.h"              /* includes gdk_posix.h */
      24             : #include "gdk_private.h"
      25             : #include "mutils.h"
      26             : #include <unistd.h>
      27             : #include <string.h>     /* strncpy */
      28             : 
      29             : #ifdef HAVE_FCNTL_H
      30             : # include <fcntl.h>
      31             : #endif
      32             : #ifdef HAVE_PROCFS_H
      33             : # include <procfs.h>
      34             : #endif
      35             : #ifdef HAVE_MACH_TASK_H
      36             : # include <mach/task.h>
      37             : #endif
      38             : #ifdef HAVE_MACH_MACH_INIT_H
      39             : # include <mach/mach_init.h>
      40             : #endif
      41             : #if defined(HAVE_KVM_H)
      42             : # include <kvm.h>
      43             : # include <sys/param.h>
      44             : # include <sys/sysctl.h>
      45             : # include <sys/user.h>
      46             : #endif
      47             : 
      48             : #if defined(__GNUC__) && defined(HAVE_VALGRIND)
      49             : #include <valgrind.h>
      50             : #else
      51             : #define VALGRIND_MALLOCLIKE_BLOCK(addr, sizeB, rzB, is_zeroed)
      52             : #define VALGRIND_FREELIKE_BLOCK(addr, rzB)
      53             : #define VALGRIND_RESIZEINPLACE_BLOCK(addr, oldSizeB, newSizeB, rzB)
      54             : #endif
      55             : 
      56             : #ifndef MAP_NORESERVE
      57             : # define MAP_NORESERVE          MAP_PRIVATE
      58             : #endif
      59             : #if defined(MAP_ANON) && !defined(MAP_ANONYMOUS)
      60             : #define MAP_ANONYMOUS           MAP_ANON
      61             : #endif
      62             : 
      63             : #define MMAP_ADVISE             7
      64             : #define MMAP_WRITABLE           (MMAP_WRITE|MMAP_COPY)
      65             : 
      66             : #ifndef O_CLOEXEC
      67             : #ifdef _O_NOINHERIT
      68             : #define O_CLOEXEC _O_NOINHERIT  /* Windows */
      69             : #else
      70             : #define O_CLOEXEC 0
      71             : #endif
      72             : #endif
      73             : 
      74             : /* Crude VM buffer management that keeps a list of all memory mapped
      75             :  * regions.
      76             :  *
      77             :  * a.k.a. "helping stupid VM implementations that ignore VM advice"
      78             :  *
      79             :  * The main goal is to be able to tell the OS to please stop buffering
      80             :  * all memory mapped pages when under pressure. A major problem is
      81             :  * materialization of large results in newly created memory mapped
      82             :  * files. Operating systems tend to cache all dirty pages, such that
      83             :  * when memory is out, all pages are dirty and cannot be unloaded
      84             :  * quickly. The VM panic occurs and comatose OS states may be
      85             :  * observed.  This is in spite of our use of
      86             :  * madvise(MADV_SEQUENTIAL). That is; we would want that the OS drops
      87             :  * pages after we've passed them. That does not happen; pages are
      88             :  * retained and pollute the buffer cache.
      89             :  *
      90             :  * Regrettably, at this level, we don't know anything about how Monet
      91             :  * is using the mmapped regions. Monet code is totally oblivious of
      92             :  * any I/O; that's why it is so easy to create CPU efficient code in
      93             :  * Monet.
      94             :  *
      95             :  * The current solution focuses on large writable maps. These often
      96             :  * represent newly created BATs, that are the result of some (running)
      97             :  * operator. We assume two things here:
      98             :  * - the BAT is created in sequential fashion (almost always true)
      99             :  * - afterwards, this BAT is used in sequential fashion (often true)
     100             :  *
     101             :  * A VMtrim thread keeps an eye on the RSS (memory pressure) and large
     102             :  * writable memory maps. If RSS approaches mem_maxsize(), it starts to
     103             :  * *worry*, and starts to write dirty data from these writable maps to
     104             :  * disk in 128MB tiles. So, if memory pressure rises further in the
     105             :  * near future, the OS has some option to release memory pages cheaply
     106             :  * (i.e. without needing I/O). This is also done explicitly by the
     107             :  * VM-thread: when RSS exceeds mem_maxsize() it explicitly asks the OS
     108             :  * to release pages.  The reason is that Linux is not smart enough to
     109             :  * do even this. Anyway..
     110             :  *
     111             :  * The way to free pages explicitly in Linux is to call
     112             :  * posix_fadvise(..,POSIX_FADV_DONTNEED).  Particularly,
     113             :  * posix_madvise(..,POSIX_MADV_DONTNEED) which is supported and
     114             :  * documented doesn't work on Linux. But we do both posix_madvise and
     115             :  * posix_fadvise, so on other unix systems that don't support
     116             :  * posix_fadvise, posix_madvise still might work.  On Windows, to our
     117             :  * knowledge, there is no way to tell it to stop buffering a memory
     118             :  * mapped region. msync (FlushViewOfFile) does work, though. So let's
     119             :  * hope the VM paging algorithm behaves better than Linux which just
     120             :  * runs off the cliff and if MonetDB does not prevent RSS from being
     121             :  * too high, enters coma.
     122             :  *
     123             :  * We will only be able to sensibly test this on Windows64. On
     124             :  * Windows32, mmap sizes do not significantly exceed RAM sizes so
     125             :  * MonetDB swapping actually will not happen (of course, you've got
     126             :  * this nasty problem of VM fragmentation and failing mmaps instead).
     127             :  *
     128             :  * In principle, page tiles are saved sequentially, and behind it, but
     129             :  * never overtaking it, is an "unload-cursor" that frees the pages if
     130             :  * that is needed to keep RSS down.  There is a tweak in the
     131             :  * algorithm, that re-sets the unload-cursor if it seems that all
     132             :  * tiles to the end have been saved (whether a tile is actually saved
     133             :  * is determined by timing the sync action). This means that the
     134             :  * producing operator has finished creating the BAT, and we assume it is
     135             :  * going to be used sequentially afterwards.  In that case, we should
     136             :  * start unloading right after the 'read-cursor', that is, from the
     137             :  * start.
     138             :  *
     139             :  * EXAMPLE
     140             :  * D = dirty tile
     141             :  * s = saved tile (i.e. clean)
     142             :  * u = unloaded tile
     143             :  * L = tile that is being loaded
     144             :  *
     145             :  *           +--> operator produces  BAT
     146             :  * (1) DDDDDD|......................................| end of reserved mmap
     147             :  *                      ____|RSS
     148             :  *                     |
     149             :  *                     | at 3/4 of RSS consumed we start to worry
     150             :  *                     +--> operator produces BAT
     151             :  * (2) DDDDDDDDDDDDDDDD|............................|
     152             :  *                    s<----------------------------- VM backwards save thread
     153             :  *                    |
     154             :  *                    + first tile of which saving costs anything
     155             :  *
     156             :  *                        +--> operator produces BAT
     157             :  * (3) DDDDDDDDDDDDDDDss|D|.........................|
     158             :  *     VM-thread save ->|
     159             :  *
     160             :  * When the RSS target is exceeded, we start unloading tiles..
     161             :  *
     162             :  *                     +-->  VM-thread unload starts at *second* 's'
     163             :  *                     |
     164             :  *                     |    +--> operator produces BAT
     165             :  * (4) DDDDDDDDDDDDDDDsus|DD|........................|
     166             :  *     VM-thread save -->|  | RSS = Full!
     167             :  *
     168             :  *                                  +-- 0 => save costs nothing!!
     169             :  *     VM-thread save ------------->|        assume bat complete
     170             :  * (5) DDDDDDDDDDDDDDDsuuuuuuuuussss0................|
     171             :  *                    |<-------- re-set unload cursor
     172             :  *                    +--- first tile was not unloaded.
     173             :  *
     174             :  * later.. some other operator sequentially reads the bat
     175             :  * first part is 'D', that is, nicely cached.
     176             :  *
     177             :  *     ---read------->|
     178             :  * (6) DDDDDDDDDDDDDDDsuuuuuuuuussss0................|
     179             :  *
     180             :  * now we're hitting the unloaded region. the query becomes
     181             :  * I/O read bound here (typically 20% CPU utilization).
     182             :  *
     183             :  *     ---read-------->|
     184             :  * (7) DDDDDDDDDDDDDDDuLuuuuuuuussss0................|
     185             :  *                   /  \
     186             :  *      unload cursor    load cursor
     187             :  *
     188             :  *     ---read---------------->|
     189             :  * (8) DDDDDDDDDDDDDDDuuuuuuuuuLssss0................|
     190             :  *                           /  \
     191             :  *              unload cursor    load cursor
     192             :  *
     193             :  *     ---read--------------------->| done
     194             :  * (9) DDDDDDDDDDDDDDDuuuuuuuuuLssss0................|
     195             :  *                              ****
     196             :  *                              last part still cached
     197             :  *
     198             :  * note: if we had not reset the unload cursor (5)
     199             :  *       the last part would have been lost due to continuing
     200             :  *       RSS pressure from the 'L' read-cursor.
     201             :  *
     202             :  * If multiple write-mmaps exist, we do unload-tile and save-tile
     203             :  * selection on a round-robin basis among them.
     204             :  *
     205             :  * Of course, this is a simple solution for simple cases only.
     206             :  * (a) if the bat is produced too fast, (or your disk is too slow)
     207             :  *     RSS will exceed its limit and Linux will go into swapping.
     208             :  * (b) if your data is not produced and read sequentially.
     209             :  *     Examples are sorting or clustering on huge datasets.
     210             :  * (c) if RSS pressure is due to large read-maps, rather than
     211             :  *     intermediate results.
     212             :  *
     213             :  * Two crude suggestions:
     214             :  * - If we are under RSS pressure without unloadable tiles and with
     215             :  *   saveable tiles, we should consider suspending *all* other threads
     216             :  *   until we manage to unload a tile.
     217             :  * - if there are no saveable tiles (or in case of read-only maps)
     218             :  *   we could resort to saving and unloading random tiles.
     219             :  *
     220             :  * To do better, our BAT algorithms should provide even more detailed
     221             :  * advice on their access patterns, which may even consist of pointers
     222             :  * to the cursors (i.e. pointers to b->batBuns->free or the cursors
     223             :  * in radix-cluster), which an enhanced version of this thread might
     224             :  * take into account.
     225             :  *
     226             :  * [Kersten] The memory map table should be aligned to the number of
     227             :  * mapped files. In more recent applications, such as the SkyServer
     228             :  * this may be around 2000 BATs easily.
     229             :  */
     230             : 
     231             : #ifdef HAVE_PTHREAD_H
     232             : /* pthread.h on Windows includes config.h if HAVE_CONFIG_H is set */
     233             : #undef HAVE_CONFIG_H
     234             : #include <sched.h>
     235             : #include <pthread.h>
     236             : #endif
     237             : #ifdef HAVE_SEMAPHORE_H
     238             : #include <semaphore.h>
     239             : #endif
     240             : 
     241             : #ifndef NATIVE_WIN32
     242             : #ifdef HAVE_POSIX_FADVISE
     243             : #ifdef HAVE_UNAME
     244             : #include <sys/utsname.h>
     245             : #endif
     246             : #endif
     247             : 
     248             : void
     249         332 : MT_init_posix(void)
     250             : {
     251         332 : }
     252             : 
     253             : /* return RSS in bytes */
     254             : size_t
     255           0 : MT_getrss(void)
     256             : {
     257             : #if defined(HAVE_PROCFS_H) && defined(__sun__)
     258             :         /* retrieve RSS the Solaris way (2.6+) */
     259             :         int fd;
     260             :         psinfo_t psbuff;
     261             : 
     262             :         fd = open("/proc/self/psinfo", O_RDONLY | O_CLOEXEC);
     263             :         if (fd >= 0) {
     264             :                 if (read(fd, &psbuff, sizeof(psbuff)) == sizeof(psbuff)) {
     265             :                         close(fd);
     266             :                         return psbuff.pr_rssize * 1024;
     267             :                 }
     268             :                 close(fd);
     269             :         }
     270             : #elif defined(HAVE_TASK_INFO)
     271             :         /* Darwin/MACH call for process' RSS */
     272             :         task_t task = mach_task_self();
     273             :         struct task_basic_info_64 t_info;
     274             :         mach_msg_type_number_t t_info_count = TASK_BASIC_INFO_64_COUNT;
     275             : 
     276             :         if (task_info(task, TASK_BASIC_INFO_64, (task_info_t)&t_info, &t_info_count) != KERN_INVALID_POLICY)
     277             :                 return t_info.resident_size;  /* bytes */
     278             : #elif defined(HAVE_KVM_H)
     279             :         /* get RSS on FreeBSD and NetBSD */
     280             :         struct kinfo_proc *ki;
     281             :         int ski = 1;
     282             :         kvm_t *kd;
     283             :         size_t rss = 0;
     284             : 
     285             :         kd = kvm_open(NULL, "/dev/null", NULL, O_RDONLY, "kvm_open");
     286             :         if (kd != NULL) {
     287             :                 ki = kvm_getprocs(kd, KERN_PROC_PID, getpid(), &ski);
     288             :                 if (ki != NULL) {
     289             : #ifdef __NetBSD__               /* should we use configure for this? */
     290             :                         /* see bug 3217 */
     291             :                         rss = ki->kp_eproc.e_vm.vm_rssize;
     292             : #else
     293             :                         rss = ki->ki_rssize;
     294             : #endif
     295             :                         kvm_close(kd);
     296             : 
     297             :                         return rss * MT_pagesize();
     298             :                 } else {
     299             :                         kvm_close(kd);
     300             :                 }
     301             :         }
     302             : #elif defined(__linux__)
     303             :         /* get RSS on Linux */
     304           0 :         int fd;
     305             : 
     306           0 :         fd = open("/proc/self/statm", O_RDONLY | O_CLOEXEC);
     307           0 :         if (fd >= 0) {
     308           0 :                 char buf[1024];
     309           0 :                 ssize_t sz = read(fd, buf, sizeof(buf) - 1);
     310             : 
     311           0 :                 close(fd);
     312           0 :                 if (sz > 0) {
     313           0 :                         buf[sz] = 0;
     314           0 :                         long rss;
     315           0 :                         if (sscanf(buf, "%*d %ld", &rss) >= 1)
     316           0 :                                 return (size_t) rss * MT_pagesize();
     317             :                 }
     318             :         }
     319             : #endif
     320             :         return 0;
     321             : }
     322             : 
     323             : void *
     324        2669 : MT_mmap(const char *path, int mode, size_t len)
     325             : {
     326        2669 :         int fd;
     327        2669 :         void *ret;
     328             : 
     329        2669 :         fd = open(path, O_CREAT | ((mode & MMAP_WRITE) ? O_RDWR : O_RDONLY) | O_CLOEXEC, MONETDB_MODE);
     330        2669 :         if (fd < 0) {
     331           0 :                 GDKsyserror("open %s failed\n", path);
     332           0 :                 return NULL;
     333             :         }
     334        2669 :         ret = mmap(NULL,
     335             :                    len,
     336        2669 :                    ((mode & MMAP_WRITABLE) ? PROT_WRITE : 0) | PROT_READ,
     337        2669 :                    (mode & MMAP_COPY) ? (MAP_PRIVATE | MAP_NORESERVE) : MAP_SHARED,
     338             :                    fd,
     339             :                    0);
     340        2669 :         if (ret == MAP_FAILED) {
     341           0 :                 GDKsyserror("mmap(%s,%zu) failed\n", path, len);
     342           0 :                 ret = NULL;
     343             :         } else {
     344        2669 :                 VALGRIND_MALLOCLIKE_BLOCK(ret, len, 0, 1);
     345             :         }
     346        2669 :         close(fd);
     347        2669 :         return ret;
     348             : }
     349             : 
     350             : int
     351        2669 : MT_munmap(void *p, size_t len)
     352             : {
     353        2669 :         int ret = munmap(p, len);
     354             : 
     355        2669 :         if (ret < 0)
     356           0 :                 GDKsyserror("munmap(%p,%zu) failed\n", p, len);
     357        2669 :         VALGRIND_FREELIKE_BLOCK(p, 0);
     358        2669 :         return ret;
     359             : }
     360             : 
     361             : /* expand or shrink a memory map (ala realloc).
     362             :  * the address returned may be different from the address going in.
     363             :  * in case of failure, the old address is still mapped and NULL is returned.
     364             :  */
     365             : void *
     366         561 : MT_mremap(const char *path, int mode, void *old_address, size_t old_size, size_t *new_size)
     367             : {
     368         561 :         void *p;
     369         561 :         int fd = -1;
     370         561 :         int flags = mode & MMAP_COPY ? MAP_PRIVATE : MAP_SHARED;
     371         561 :         int prot = PROT_WRITE | PROT_READ;
     372             : 
     373             : #ifdef MAP_FIXED_NOREPLACE
     374         561 :         flags |= MAP_FIXED_NOREPLACE;
     375             : #endif
     376             :         /* round up to multiple of page size */
     377         561 :         *new_size = (*new_size + GDK_mmap_pagesize - 1) & ~(GDK_mmap_pagesize - 1);
     378             : 
     379             :         /* doesn't make sense for us to extend read-only memory map */
     380         561 :         assert(mode & MMAP_WRITABLE);
     381             : 
     382         561 :         if (*new_size < old_size) {
     383             : #ifndef __COVERITY__    /* hide this from static code analyzer */
     384             :                 /* shrink */
     385           0 :                 VALGRIND_RESIZEINPLACE_BLOCK(old_address, old_size, *new_size, 0);
     386           0 :                 if (munmap((char *) old_address + *new_size,
     387             :                            old_size - *new_size) < 0) {
     388           0 :                         GDKsyserror("MT_mremap(%s,%p,%zu,%zu): munmap() failed\n", path?path:"NULL", old_address, old_size, *new_size);
     389             :                         /* even though the system call failed, we
     390             :                          * don't need to propagate the error up: the
     391             :                          * address should still work in the same way
     392             :                          * as it did before */
     393           0 :                         return old_address;
     394             :                 }
     395           0 :                 if (path && truncate(path, *new_size) < 0)
     396           0 :                         GDKwarning("truncate of %s failed: %s\n",
     397             :                                     path, GDKstrerror(errno, (char[64]){0}, 64));
     398             : #endif  /* !__COVERITY__ */
     399           0 :                 return old_address;
     400             :         }
     401         561 :         if (*new_size == old_size) {
     402             :                 /* do nothing */
     403             :                 return old_address;
     404             :         }
     405             : 
     406         561 :         if (!(mode & MMAP_COPY) && path != NULL) {
     407             :                 /* "normal" memory map */
     408             : 
     409         561 :                 if ((fd = open(path, O_RDWR | O_CLOEXEC)) < 0) {
     410           0 :                         GDKsyserror("MT_mremap(%s,%p,%zu,%zu): open failed\n",
     411             :                                     path, old_address, old_size, *new_size);
     412           0 :                         return NULL;
     413             :                 }
     414         561 :                 if (GDKextendf(fd, *new_size, path) != GDK_SUCCEED) {
     415           0 :                         close(fd);
     416           0 :                         TRC_ERROR(GDK, "MT_mremap(%s,%p,%zu,%zu): GDKextendf() failed\n", path, old_address, old_size, *new_size);
     417           0 :                         return NULL;
     418             :                 }
     419             : #ifdef HAVE_MREMAP
     420             :                 /* on Linux it's easy */
     421         561 :                 p = mremap(old_address, old_size, *new_size, MREMAP_MAYMOVE);
     422             : #ifdef HAVE_VALGRIND
     423             :                 if (p != MAP_FAILED) {
     424             :                         if (p == old_address) {
     425             :                                 VALGRIND_RESIZEINPLACE_BLOCK(old_address, old_size, *new_size, 0);
     426             :                         } else {
     427             :                                 VALGRIND_FREELIKE_BLOCK(old_address, 0);
     428             :                                 VALGRIND_MALLOCLIKE_BLOCK(p, *new_size, 0, 1);
     429             :                         }
     430             :                 }
     431             : #endif
     432             : #else
     433             :                 /* try to map extension at end of current map */
     434             :                 p = mmap((char *) old_address + old_size, *new_size - old_size,
     435             :                          prot, flags, fd, old_size);
     436             :                 /* if it failed and MAP_FIXED_NOREPLACE is not defined,
     437             :                  * there is no point trying a full mmap: that too won't
     438             :                  * fit either (if MAP_FIXED_NOREPLACE, only relevant
     439             :                  * failure is with EEXIST) */
     440             :                 if (p != MAP_FAILED || errno == EEXIST) {
     441             :                         if (p == (char *) old_address + old_size) {
     442             :                                 /* we got the requested address, make
     443             :                                  * sure we return the correct (old)
     444             :                                  * address */
     445             :                                 VALGRIND_RESIZEINPLACE_BLOCK(old_address, old_size, *new_size, 0);
     446             :                                 p = old_address;
     447             :                         } else {
     448             :                                 /* we got some other address: discard
     449             :                                  * it and make full mmap */
     450             :                                 if (p != MAP_FAILED &&
     451             :                                     munmap(p, *new_size - old_size) < 0)
     452             :                                         GDKsyserror("munmap");
     453             : #ifdef NO_MMAP_ALIASING
     454             :                                 if (msync(old_address, old_size, MS_SYNC) < 0)
     455             :                                         GDKsyserror("msync");
     456             : #endif
     457             :                                 /* first create full mmap, then, if
     458             :                                  * successful, remove old mmap */
     459             : #ifdef MAP_FIXED_NOREPLACE
     460             :                                 flags &= ~MAP_FIXED_NOREPLACE;
     461             : #endif
     462             :                                 p = mmap(NULL, *new_size, prot, flags, fd, 0);
     463             :                                 if (p != MAP_FAILED) {
     464             :                                         VALGRIND_MALLOCLIKE_BLOCK(p, *new_size, 0, 1);
     465             :                                         if (munmap(old_address, old_size) < 0)
     466             :                                                 GDKsyserror("munmap");
     467             :                                         VALGRIND_FREELIKE_BLOCK(old_address, 0);
     468             :                                 }
     469             :                         }
     470             :                 }
     471             : #endif  /* HAVE_MREMAP */
     472             :         } else {
     473             :                 /* "copy-on-write" or "anonymous" memory map */
     474             : #ifdef MAP_ANONYMOUS
     475           0 :                 flags |= MAP_ANONYMOUS;
     476             : #else
     477             :                 if ((fd = open("/dev/zero", O_RDWR | O_CLOEXEC)) < 0) {
     478             :                         GDKsyserror("MT_mremap(%s,%p,%zu,%zu): "
     479             :                                     "open('/dev/zero') failed\n",
     480             :                                     path ? path : "NULL", old_address,
     481             :                                     old_size, *new_size);
     482             :                         return NULL;
     483             :                 }
     484             : #endif
     485             :                 /* try to map an anonymous area as extent to the
     486             :                  * current map */
     487           0 :                 p = mmap((char *) old_address + old_size, *new_size - old_size,
     488             :                          prot, flags, fd, 0);
     489             :                 /* no point trying a full map if this didn't work:
     490             :                  * there isn't enough space */
     491           0 :                 if (p != MAP_FAILED || errno == EEXIST) {
     492           0 :                         if (p == (char *) old_address + old_size) {
     493             :                                 /* we got the requested address, make
     494             :                                  * sure we return the correct (old)
     495             :                                  * address */
     496             :                                 VALGRIND_RESIZEINPLACE_BLOCK(old_address, old_size, *new_size, 0);
     497             :                                 p = old_address;
     498             :                         } else {
     499             :                                 /* we got some other address: discard
     500             :                                  * it and make full mmap */
     501           0 :                                 if (p != MAP_FAILED &&
     502           0 :                                     munmap(p, *new_size - old_size) < 0)
     503           0 :                                         GDKsyserror("munmap");
     504             : #ifdef MAP_FIXED_NOREPLACE
     505           0 :                                 flags &= ~MAP_FIXED_NOREPLACE;
     506             : #endif
     507             : #ifdef HAVE_MREMAP
     508             :                                 /* first get an area large enough for
     509             :                                  * *new_size */
     510           0 :                                 p = mmap(NULL, *new_size, prot, flags, fd, 0);
     511           0 :                                 if (p != MAP_FAILED) {
     512             :                                         /* then overlay old mmap over new */
     513           0 :                                         void *q;
     514             : 
     515           0 :                                         q = mremap(old_address, old_size,
     516             :                                                    old_size,
     517             :                                                    MREMAP_FIXED | MREMAP_MAYMOVE,
     518             :                                                    p);
     519           0 :                                         assert(q == p || q == MAP_FAILED);
     520           0 :                                         if (q == MAP_FAILED) {
     521           0 :                                                 int e = errno;
     522             :                                                 /* we didn't expect this... */
     523           0 :                                                 if (munmap(p, *new_size) < 0)
     524           0 :                                                         GDKsyserror("munmap");
     525           0 :                                                 p = MAP_FAILED;
     526           0 :                                                 errno = e;
     527             :                                         }
     528             : #ifdef HAVE_VALGRIND
     529             :                                         else {
     530             :                                                 VALGRIND_FREELIKE_BLOCK(old_size, 0);
     531             :                                                 VALGRIND_MALLOCLIKE_BLOCK(p, *new_size, 0, 1);
     532             :                                         }
     533             : #endif
     534             :                                 }
     535             : #else
     536             :                                 p = MAP_FAILED;
     537             :                                 if (path == NULL ||
     538             :                                     *new_size <= GDK_mmap_minsize_persistent) {
     539             :                                         /* size not too big yet or
     540             :                                          * anonymous, try to make new
     541             :                                          * anonymous mmap and copy
     542             :                                          * data over */
     543             :                                         p = mmap(NULL, *new_size, prot, flags,
     544             :                                                  fd, 0);
     545             :                                         if (p != MAP_FAILED) {
     546             :                                                 VALGRIND_MALLOCLIKE_BLOCK(p, *new_size, 0, 0);
     547             :                                                 memcpy(p, old_address,
     548             :                                                        old_size);
     549             :                                                 munmap(old_address, old_size);
     550             :                                                 VALGRIND_FREELIKE_BLOCK(old_address, 0);
     551             :                                         }
     552             :                                         /* if it failed, try alternative */
     553             :                                 }
     554             :                                 if (p == MAP_FAILED && path != NULL) {
     555             :                                         /* write data to disk, then
     556             :                                          * mmap it to new address */
     557             :                                         if (fd >= 0)
     558             :                                                 close(fd);
     559             :                                         fd = -1;
     560             :                                         p = malloc(strlen(path) + 5);
     561             :                                         if (p == NULL){
     562             :                                                 GDKsyserror("MT_mremap(%s,%p,%zu,%zu): fd < 0\n", path, old_address, old_size, *new_size);
     563             :                                                 return NULL;
     564             :                                         }
     565             : 
     566             :                                         strcat(strcpy(p, path), ".tmp");
     567             :                                         fd = open(p, O_RDWR | O_CREAT | O_CLOEXEC,
     568             :                                                   MONETDB_MODE);
     569             :                                         if (fd < 0) {
     570             :                                                 GDKsyserror("MT_mremap(%s,%p,%zu,%zu): fd < 0\n", path, old_address, old_size, *new_size);
     571             :                                                 free(p);
     572             :                                                 return NULL;
     573             :                                         }
     574             :                                         free(p);
     575             :                                         if (write(fd, old_address,
     576             :                                                   old_size) < 0 ||
     577             : #ifdef HAVE_FALLOCATE
     578             :                                             /* prefer Linux-specific
     579             :                                              * fallocate over standard
     580             :                                              * posix_fallocate, since
     581             :                                              * glibc uses a rather
     582             :                                              * slow method of
     583             :                                              * allocating the file if
     584             :                                              * the file system doesn't
     585             :                                              * support the operation,
     586             :                                              * we just use ftruncate
     587             :                                              * in that case */
     588             :                                             (fallocate(fd, 0, (off_t) old_size, (off_t) *new_size - (off_t) old_size) < 0 && (errno != EOPNOTSUPP || ftruncate(fd, (off_t) *new_size) < 0))
     589             : #else
     590             : #ifdef HAVE_POSIX_FALLOCATE
     591             :                                             /* posix_fallocate returns
     592             :                                              * error number on
     593             :                                              * failure, not -1, and if
     594             :                                              * it returns EINVAL, the
     595             :                                              * underlying file system
     596             :                                              * may not support the
     597             :                                              * operation, so we then
     598             :                                              * need to try
     599             :                                              * ftruncate */
     600             :                                             ((errno = posix_fallocate(fd, (off_t) old_size, (off_t) *new_size - (off_t) old_size)) == EINVAL ? ftruncate(fd, (off_t) *new_size) < 0 : errno != 0)
     601             : #else
     602             :                                             ftruncate(fd, (off_t) *new_size) < 0
     603             : #endif
     604             : #endif
     605             :                                                 ) {
     606             :                                                 GDKsyserror("MT_mremap(%s,%p,%zu,%zu): write() or "
     607             : #ifdef HAVE_FALLOCATE
     608             :                                                             "fallocate()"
     609             : #else
     610             : #ifdef HAVE_POSIX_FALLOCATE
     611             :                                                             "posix_fallocate()"
     612             : #else
     613             :                                                             "ftruncate()"
     614             : #endif
     615             : #endif
     616             :                                                             " failed\n", path, old_address, old_size, *new_size);
     617             :                                                 /* extending failed:
     618             :                                                  * free any disk space
     619             :                                                  * allocated in the
     620             :                                                  * process */
     621             :                                                 if (ftruncate(fd, (off_t) old_size) < 0)
     622             :                                                         GDKsyserror("MT_mremap(%s,%p,%zu,%zu): ftruncate() failed\n", path, old_address, old_size, *new_size);
     623             :                                                 close(fd);
     624             :                                                 return NULL;
     625             :                                         }
     626             :                                         p = mmap(NULL, *new_size, prot, flags,
     627             :                                                  fd, 0);
     628             :                                         if (p != MAP_FAILED) {
     629             :                                                 VALGRIND_MALLOCLIKE_BLOCK(p, *new_size, 0, 1);
     630             :                                                 munmap(old_address, old_size);
     631             :                                                 VALGRIND_FREELIKE_BLOCK(old_address, 0);
     632             :                                         }
     633             :                                 }
     634             : #endif  /* HAVE_MREMAP */
     635             :                         }
     636             :                 }
     637             :         }
     638         561 :         if (p == MAP_FAILED)
     639           0 :                 GDKsyserror("MT_mremap(%s,%p,%zu,%zu): p == MAP_FAILED\n", path?path:"NULL", old_address, old_size, *new_size);
     640         561 :         if (fd >= 0)
     641         561 :                 close(fd);
     642         561 :         return p == MAP_FAILED ? NULL : p;
     643             : }
     644             : 
     645             : int
     646         208 : MT_msync(void *p, size_t len)
     647             : {
     648         208 :         int ret = msync(p, len, MS_SYNC);
     649             : 
     650         208 :         if (ret < 0)
     651           0 :                 GDKsyserror("msync failed\n");
     652         208 :         return ret;
     653             : }
     654             : 
     655             : bool
     656    23306747 : MT_path_absolute(const char *pathname)
     657             : {
     658    23306747 :         return (*pathname == DIR_SEP);
     659             : }
     660             : 
     661             : #ifdef HAVE_DLFCN_H
     662             : # include <dlfcn.h>
     663             : #endif
     664             : 
     665             : void *
     666           0 : mdlopen(const char *library, int mode)
     667             : {
     668           0 :         (void)library; /* Not used because of MacOs not handling dlopen on linked library */
     669           0 :         return dlopen(NULL, mode);
     670             : }
     671             : 
     672             : #else /* WIN32 native */
     673             : 
     674             : #ifndef BUFSIZ
     675             : #define BUFSIZ 1024
     676             : #endif
     677             : 
     678             : #undef _errno
     679             : 
     680             : #include <windows.h>
     681             : 
     682             : #ifdef _MSC_VER
     683             : #include <io.h>
     684             : #endif /* _MSC_VER */
     685             : #include <Psapi.h>
     686             : 
     687             : #define MT_SMALLBLOCK 256
     688             : 
     689             : static LONG WINAPI
     690             : MT_ignore_exceptions(struct _EXCEPTION_POINTERS *ExceptionInfo)
     691             : {
     692             :         (void) ExceptionInfo;
     693             :         return EXCEPTION_EXECUTE_HANDLER;
     694             : }
     695             : 
     696             : void
     697             : MT_init_posix(void)
     698             : {
     699             :         SetUnhandledExceptionFilter(MT_ignore_exceptions);
     700             : }
     701             : 
     702             : size_t
     703             : MT_getrss(void)
     704             : {
     705             :         PROCESS_MEMORY_COUNTERS ctr;
     706             :         if (GetProcessMemoryInfo(GetCurrentProcess(), &ctr, sizeof(ctr)))
     707             :                 return ctr.WorkingSetSize;
     708             :         return 0;
     709             : }
     710             : 
/* Windows mmap keeps a global list of base addresses for complex
 * (remapped) memory maps.  The reason is that each remapped segment
 * needs to be unmapped separately in the end. */
     714             : 
/* Map the file at path into memory (WIN32 implementation).
 *
 * path: UTF-8 file name; it is converted with utf8towchar() and the
 *       file is opened with OPEN_ALWAYS.
 * mode: bit set of MMAP_* flags; MMAP_WRITE and MMAP_COPY select the
 *       access/protection, the MMAP_ADVISE bits and MMAP_SYNC select
 *       the file flags passed to CreateFileW.
 * len:  size passed to both CreateFileMapping and MapViewOfFileEx.
 *
 * Returns the address of the mapped view, or NULL on failure.  Failure
 * of CreateFileW or CreateFileMapping is reported through GDKwinerror;
 * failure of MapViewOfFileEx only sets errno.  Both the file handle
 * and the mapping handle are closed before returning; only the view
 * pointer is handed to the caller. */
void *
MT_mmap(const char *path, int mode, size_t len)
{
        DWORD mode0 = FILE_READ_ATTRIBUTES | FILE_READ_DATA;    /* desired access for CreateFileW */
        DWORD mode1 = FILE_SHARE_READ | FILE_SHARE_WRITE;       /* share mode for CreateFileW */
        DWORD mode2 = mode & MMAP_ADVISE;       /* starts as the advice bits, becomes the file flags */
        DWORD mode3 = PAGE_READONLY;            /* page protection for CreateFileMapping */
        int mode4 = FILE_MAP_READ;              /* access for MapViewOfFileEx */
        SECURITY_ATTRIBUTES sa;
        HANDLE h1, h2;
        void *ret;
        wchar_t *wpath = utf8towchar(path);
        if (wpath == NULL)
                return NULL;

        if (mode & MMAP_WRITE) {
                mode0 |= FILE_APPEND_DATA | FILE_WRITE_ATTRIBUTES | FILE_WRITE_DATA;
        }
        /* translate the access advice into a CreateFile flag */
        if (mode2 == MMAP_RANDOM || mode2 == MMAP_DONTNEED) {
                mode2 = FILE_FLAG_RANDOM_ACCESS;
        } else if (mode2 == MMAP_SEQUENTIAL || mode2 == MMAP_WILLNEED) {
                mode2 = FILE_FLAG_SEQUENTIAL_SCAN;
        } else {
                mode2 = FILE_FLAG_NO_BUFFERING;
        }
        if (mode & MMAP_SYNC) {
                mode2 |= FILE_FLAG_WRITE_THROUGH;
        }
        /* MMAP_COPY takes precedence over MMAP_WRITE when choosing the
         * page protection and view access */
        if (mode & MMAP_COPY) {
                mode3 = PAGE_WRITECOPY;
                mode4 = FILE_MAP_COPY;
        } else if (mode & MMAP_WRITE) {
                mode3 = PAGE_READWRITE;
                mode4 = FILE_MAP_WRITE;
        }
        mode2 |= FILE_ATTRIBUTE_NOT_CONTENT_INDEXED;
        sa.nLength = sizeof(SECURITY_ATTRIBUTES);
        sa.bInheritHandle = TRUE;
        sa.lpSecurityDescriptor = 0;

        h1 = CreateFileW(wpath, mode0, mode1, &sa, OPEN_ALWAYS, mode2, NULL);
        if (h1 == INVALID_HANDLE_VALUE) {
                /* reset the file attributes to normal and retry once */
                (void) SetFileAttributesW(wpath, FILE_ATTRIBUTE_NORMAL);
                h1 = CreateFileW(wpath, mode0, mode1, &sa, OPEN_ALWAYS, mode2, NULL);
                if (h1 == INVALID_HANDLE_VALUE) {
                        free(wpath);
                        GDKwinerror("CreateFile('%s', %lu, %lu, &sa, %lu, %lu, NULL) failed\n",
                                    path, (unsigned long) mode0, (unsigned long) mode1, (unsigned long) OPEN_ALWAYS, (unsigned long) mode2);
                        return NULL;
                }
        }
        free(wpath);

        /* the mapping size is passed as separate high and low 32-bit halves */
        h2 = CreateFileMapping(h1, &sa, mode3, (DWORD) (((__int64) len >> 32) & LL_CONSTANT(0xFFFFFFFF)), (DWORD) (len & LL_CONSTANT(0xFFFFFFFF)), NULL);
        if (h2 == NULL) {
                GDKwinerror("CreateFileMapping(%p, &sa, %lu, %lu, %lu, NULL) failed\n",
                            h1, (unsigned long) mode3,
                            (unsigned long) (((unsigned __int64) len >> 32) & LL_CONSTANT(0xFFFFFFFF)),
                            (unsigned long) (len & LL_CONSTANT(0xFFFFFFFF)));
                CloseHandle(h1);
                return NULL;
        }
        /* the file handle is not needed once the mapping object exists */
        CloseHandle(h1);

        ret = MapViewOfFileEx(h2, mode4, (DWORD) 0, (DWORD) 0, len, NULL);
        if (ret == NULL)
                errno = winerror(GetLastError());
        /* the mapping handle is closed on both the success and the
         * failure path */
        CloseHandle(h2);

        return ret;
}
     786             : 
     787             : int
     788             : MT_munmap(void *p, size_t dummy)
     789             : {
     790             :         int ret;
     791             : 
     792             :         (void) dummy;
     793             :         /*       Windows' UnmapViewOfFile returns success!=0, error== 0,
     794             :          * while Unix's   munmap          returns success==0, error==-1. */
     795             :         ret = UnmapViewOfFile(p);
     796             :         if (ret == 0) {
     797             :                 GDKwinerror("UnmapViewOfFile failed\n");
     798             :                 return -1;
     799             :         }
     800             :         return 0;
     801             : }
     802             : 
/* Grow an existing memory map (WIN32 implementation).
 *
 * path:        file backing the map; the code also tests for NULL.
 * mode:        MMAP_* flags; must include a MMAP_WRITABLE bit (asserted).
 * old_address: start of the current map.
 * old_size:    size of the current map.
 * new_size:    in: requested size; out: the size actually in effect
 *              (rounded up, or old_size when no growth is needed).
 *
 * Returns the address of the map (old_address itself when the map is
 * not grown), or NULL on failure.
 *
 * NOTE: for a file-backed, non-MMAP_COPY map the old view is unmapped
 * before the new one is created, so when NULL is returned on that path
 * old_address is no longer mapped. */
void *
MT_mremap(const char *path, int mode, void *old_address, size_t old_size, size_t *new_size)
{
        void *p;

        /* doesn't make sense for us to extend read-only memory map */
        assert(mode & MMAP_WRITABLE);

        /* round up to multiple of page size (the mask arithmetic
         * presumably relies on GDK_mmap_pagesize being a power of two) */
        *new_size = (*new_size + GDK_mmap_pagesize - 1) & ~(GDK_mmap_pagesize - 1);

        if (old_size >= *new_size) {
                *new_size = old_size;
                return old_address;     /* don't bother shrinking */
        }
        /* grow the backing file first */
        if (GDKextend(path, *new_size) != GDK_SUCCEED) {
                TRC_ERROR(GDK, "MT_mremap(%s,%p,%zu,%zu): GDKextend() failed\n", path?path:"NULL", old_address, old_size, *new_size);
                return NULL;
        }
        /* file-backed and not a copy map: drop the old view, then map
         * the file again at its new size */
        if (path && !(mode & MMAP_COPY))
                MT_munmap(old_address, old_size);
        p = MT_mmap(path, mode, *new_size);
        /* no path or MMAP_COPY: the old view was kept above, so copy
         * its contents into the new map before releasing it */
        if (p != NULL && (path == NULL || (mode & MMAP_COPY))) {
                memcpy(p, old_address, old_size);
                MT_munmap(old_address, old_size);
        }

        if (p == NULL)
                TRC_ERROR(GDK, "MT_mremap(%s,%p,%zu,%zu): p == NULL\n", path?path:"NULL", old_address, old_size, *new_size);
        return p;
}
     834             : 
     835             : int
     836             : MT_msync(void *p, size_t len)
     837             : {
     838             :         int ret;
     839             : 
     840             :         /*       Windows' FlushViewOfFile returns success!=0, error== 0,
     841             :          * while Unix's   munmap          returns success==0, error==-1. */
     842             :         ret = FlushViewOfFile(p, len);
     843             :         if (ret == 0) {
     844             :                 GDKwinerror("FlushViewOfFile failed\n");
     845             :                 return -1;
     846             :         }
     847             :         return 0;
     848             : }
     849             : 
     850             : bool
     851             : MT_path_absolute(const char *pathname)
     852             : {
     853             :         /* drive letter, colon, directory separator */
     854             :         return (((('a' <= pathname[0] && pathname[0] <= 'z') ||
     855             :                   ('A' <= pathname[0] && pathname[0] <= 'Z')) &&
     856             :                  pathname[1] == ':' &&
     857             :                  (pathname[2] == '/' || pathname[2] == '\\')) ||
     858             :                 (pathname[0] == '\\')); // && pathname[1] == '\\'));
     859             : }
     860             : 
     861             : #ifndef HAVE_GETTIMEOFDAY
     862             : static int nodays[12] = { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 };
     863             : 
     864             : #define LEAPYEAR(y) ((((y)%4)==0 && ((y)%100)!=0) || ((y)%400)==0)
     865             : #define NODAYS(m,y) (((m)!=2)?nodays[(m)-1]:LEAPYEAR(y)?29:28)
     866             : 
     867             : int
     868             : gettimeofday(struct timeval *tv, int *ignore_zone)
     869             : {
     870             :         unsigned int year, day, month;
     871             :         SYSTEMTIME st;
     872             : 
     873             :         (void) ignore_zone;
     874             :         GetSystemTime(&st);
     875             :         day = 0;
     876             :         for (year = 1970; year < st.wYear; year++)
     877             :                 day += LEAPYEAR(year) ? 366 : 365;
     878             : 
     879             :         for (month = 1; month < st.wMonth; month++)
     880             :                 day += NODAYS(month, st.wYear);
     881             : 
     882             :         day += st.wDay;
     883             :         tv->tv_sec = 60 * (day * 24 * 60 + st.wMinute) + st.wSecond;
     884             :         tv->tv_usec = 1000 * st.wMilliseconds;
     885             :         return 0;
     886             : }
     887             : #endif
     888             : 
     889             : void *
     890             : mdlopen(const char *file, int mode)
     891             : {
     892             :         return dlopen(file, mode);
     893             : }
     894             : 
     895             : void *
     896             : dlopen(const char *file, int mode)
     897             : {
     898             :         (void) mode;
     899             :         if (file != NULL) {
     900             :                 wchar_t *wfile = utf8towchar(file);
     901             :                 if (wfile == NULL)
     902             :                         return NULL;
     903             :                 void *ret = LoadLibraryW(wfile);
     904             :                 free(wfile);
     905             :                 return ret;
     906             :         }
     907             :         return GetModuleHandle(NULL);
     908             : }
     909             : 
     910             : int
     911             : dlclose(void *handle)
     912             : {
     913             :         if (handle != NULL) {
     914             :                 return FreeLibrary((HINSTANCE) handle);
     915             :         }
     916             :         return -1;
     917             : }
     918             : 
     919             : void *
     920             : dlsym(void *handle, const char *name)
     921             : {
     922             :         if (handle != NULL) {
     923             :                 return (void *) GetProcAddress((HINSTANCE) handle, name);
     924             :         }
     925             :         return NULL;
     926             : }
     927             : 
     928             : char *
     929             : dlerror(void)
     930             : {
     931             :         static char msg[1024];
     932             : 
     933             :         FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM, NULL, GetLastError(), 0, msg, sizeof(msg), NULL);
     934             :         return msg;
     935             : }
     936             : #endif
     937             : 
     938             : void
     939      439524 : MT_sleep_ms(unsigned int ms)
     940             : {
     941             : #ifdef NATIVE_WIN32
     942             :         Sleep(ms);
     943             : #else
     944             : #ifdef HAVE_NANOSLEEP
     945      439524 :         (void) nanosleep(&(struct timespec) {.tv_sec = ms / 1000,
     946      439524 :                                 .tv_nsec = ms == 1 ? 1000 : (long) (ms % 1000) * 1000000,},
     947             :                 NULL);
     948             : #else
     949             :         (void) select(0, NULL, NULL, NULL,
     950             :                       &(struct timeval) {.tv_sec = ms / 1000,
     951             :                                       .tv_usec = ms == 1 ? 1 : (ms % 1000) * 1000,});
     952             : #endif
     953             : #endif
     954      439514 : }
     955             : 
     956             : #if !defined(HAVE_LOCALTIME_R) || !defined(HAVE_GMTIME_R) || !defined(HAVE_ASCTIME_R) || !defined(HAVE_CTIME_R)
/* guards the calls to localtime(), gmtime(), asctime() and ctime() made
 * by the fallback *_r implementations in this file */
static MT_Lock timelock = MT_LOCK_INITIALIZER(timelock);
     958             : #endif
     959             : 
     960             : #ifndef HAVE_LOCALTIME_R
     961             : struct tm *
     962             : localtime_r(const time_t *restrict timep, struct tm *restrict result)
     963             : {
     964             :         struct tm *tmp;
     965             :         MT_lock_set(&timelock);
     966             :         tmp = localtime(timep);
     967             :         if (tmp)
     968             :                 *result = *tmp;
     969             :         MT_lock_unset(&timelock);
     970             :         return tmp ? result : NULL;
     971             : }
     972             : #endif
     973             : 
     974             : #ifndef HAVE_GMTIME_R
     975             : struct tm *
     976             : gmtime_r(const time_t *restrict timep, struct tm *restrict result)
     977             : {
     978             :         struct tm *tmp;
     979             :         MT_lock_set(&timelock);
     980             :         tmp = gmtime(timep);
     981             :         if (tmp)
     982             :                 *result = *tmp;
     983             :         MT_lock_unset(&timelock);
     984             :         return tmp ? result : NULL;
     985             : }
     986             : #endif
     987             : 
     988             : #ifndef HAVE_ASCTIME_R
     989             : char *
     990             : asctime_r(const struct tm *restrict tm, char *restrict buf)
     991             : {
     992             :         char *tmp;
     993             :         MT_lock_set(&timelock);
     994             :         tmp = asctime(tm);
     995             :         if (tmp)
     996             :                 strcpy(buf, tmp);
     997             :         MT_lock_unset(&timelock);
     998             :         return tmp ? buf : NULL;
     999             : }
    1000             : #endif
    1001             : 
    1002             : #ifndef HAVE_CTIME_R
    1003             : char *
    1004             : ctime_r(const time_t *restrict t, char *restrict buf)
    1005             : {
    1006             :         char *tmp;
    1007             :         MT_lock_set(&timelock);
    1008             :         tmp = ctime(t);
    1009             :         if (tmp)
    1010             :                 strcpy(buf, tmp);
    1011             :         MT_lock_unset(&timelock);
    1012             :         return tmp ? buf : NULL;
    1013             : }
    1014             : #endif
    1015             : 
    1016             : #if !defined(HAVE_STRERROR_R) && !defined(HAVE_STRERROR_S)
    1017             : static MT_Lock strerrlock = MT_LOCK_INITIALIZER(strerrlock);
    1018             : 
    1019             : int
    1020             : strerror_r(int errnum, char *buf, size_t buflen)
    1021             : {
    1022             :         char *msg;
    1023             :         MT_lock_set(&strerrlock);
    1024             :         msg = strerror(errnum);
    1025             :         strcpy_len(buf, msg, buflen);
    1026             :         MT_lock_unset(&strerrlock);
    1027             :         return 0;
    1028             : }
    1029             : #endif

Generated by: LCOV version 1.14