Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024, 2025 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : /*
14 : * @a Niels Nes, Peter Boncz
15 : * @* System Independent Layer
16 : *
17 : * GDK is built on Posix. Exceptions are made for memory mapped files
18 : * and anonymous virtual memory, for which somewhat higher-level
19 : * functions are defined here. Most of this file concerns itself with
20 : * emulation of Posix functionality on the WIN32 native platform.
21 : */
22 : #include "monetdb_config.h"
23 : #include "gdk.h" /* includes gdk_posix.h */
24 : #include "gdk_private.h"
25 : #include "mutils.h"
26 : #include <unistd.h>
27 : #include <string.h> /* strncpy */
28 :
29 : #ifdef HAVE_FCNTL_H
30 : # include <fcntl.h>
31 : #endif
32 : #ifdef HAVE_PROCFS_H
33 : # include <procfs.h>
34 : #endif
35 : #ifdef HAVE_MACH_TASK_H
36 : # include <mach/task.h>
37 : #endif
38 : #ifdef HAVE_MACH_MACH_INIT_H
39 : # include <mach/mach_init.h>
40 : #endif
41 : #if defined(HAVE_KVM_H)
42 : # include <kvm.h>
43 : # include <sys/param.h>
44 : # include <sys/sysctl.h>
45 : # include <sys/user.h>
46 : #endif
47 :
/* Without Valgrind support the client-request macros used in this file
 * are defined to expand to nothing, so the calls compile away. */
48 : #if defined(__GNUC__) && defined(HAVE_VALGRIND)
49 : #include <valgrind.h>
50 : #else
51 : #define VALGRIND_MALLOCLIKE_BLOCK(addr, sizeB, rzB, is_zeroed)
52 : #define VALGRIND_FREELIKE_BLOCK(addr, rzB)
53 : #define VALGRIND_RESIZEINPLACE_BLOCK(addr, oldSizeB, newSizeB, rzB)
54 : #endif
55 :
/* Fallback when the platform has no MAP_NORESERVE: alias it to
 * MAP_PRIVATE.  MT_mmap only ever ORs it together with MAP_PRIVATE, so
 * the alias adds no extra flag bits there. */
56 : #ifndef MAP_NORESERVE
57 : # define MAP_NORESERVE MAP_PRIVATE
58 : #endif
/* Provide the MAP_ANONYMOUS spelling on systems that only define
 * MAP_ANON. */
59 : #if defined(MAP_ANON) && !defined(MAP_ANONYMOUS)
60 : #define MAP_ANONYMOUS MAP_ANON
61 : #endif
62 :
/* MMAP_ADVISE masks the access-advice bits out of an mmap mode (the
 * Windows MT_mmap compares the masked value with MMAP_RANDOM,
 * MMAP_SEQUENTIAL, ...); MMAP_WRITABLE selects the modes for which
 * the mapping is made writable. */
63 : #define MMAP_ADVISE 7
64 : #define MMAP_WRITABLE (MMAP_WRITE|MMAP_COPY)
65 :
/* Fallback for platforms without O_CLOEXEC: use _O_NOINHERIT where it
 * exists, otherwise 0 so that OR-ing the flag in has no effect. */
66 : #ifndef O_CLOEXEC
67 : #ifdef _O_NOINHERIT
68 : #define O_CLOEXEC _O_NOINHERIT /* Windows */
69 : #else
70 : #define O_CLOEXEC 0
71 : #endif
72 : #endif
73 :
74 : /* Crude VM buffer management that keeps a list of all memory mapped
75 : * regions.
76 : *
77 : * a.k.a. "helping stupid VM implementations that ignore VM advice"
78 : *
79 : * The main goal is to be able to tell the OS to please stop buffering
80 : * all memory mapped pages when under pressure. A major problem is
81 : * materialization of large results in newly created memory mapped
82 : * files. Operating systems tend to cache all dirty pages, such that
83 : * when memory is out, all pages are dirty and cannot be unloaded
84 : * quickly. The VM panic occurs and comatose OS states may be
85 : * observed. This is in spite of our use of
86 : * madvise(MADV_SEQUENTIAL). That is; we would want that the OS drops
87 : * pages after we've passed them. That does not happen; pages are
88 : * retained and pollute the buffer cache.
89 : *
90 : * Regrettably, at this level, we don't know anything about how Monet
91 : * is using the mmapped regions. Monet code is totally oblivious of
92 : * any I/O; that's why it is so easy to create CPU efficient code in
93 : * Monet.
94 : *
95 : * The current solution focuses on large writable maps. These often
96 : * represent newly created BATs, that are the result of some (running)
97 : * operator. We assume two things here:
98 : * - the BAT is created in sequential fashion (always almost true)
99 : * - afterwards, this BAT is used in sequential fashion (often true)
100 : *
101 : * A VMtrim thread keeps an eye on the RSS (memory pressure) and large
102 : * writable memory maps. If RSS approaches mem_maxsize(), it starts to
103 : * *worry*, and starts to write dirty data from these writable maps to
104 : * disk in 128MB tiles. So, if memory pressure rises further in the
105 : * near future, the OS has some option to release memory pages cheaply
106 : * (i.e. without needing I/O). This is also done explicitly by the
107 : * VM-thread: when RSS exceeds mem_maxsize() it explicitly asks the OS
108 : * to release pages. The reason is that Linux is not smart enough to
109 : * do even this. Anyway..
110 : *
111 : * The way to free pages explicitly in Linux is to call
112 : * posix_fadvise(..,POSIX_FADV_DONTNEED). Particularly,
113 : * posix_madvise(..,POSIX_MADV_DONTNEED) which is supported and
114 : * documented doesn't work on Linux. But we do both posix_madvise and
115 : * posix_fadvise, so on other unix systems that don't support
116 : * posix_fadvise, posix_madvise still might work. On Windows, to our
117 : * knowledge, there is no way to tell it stop buffering a memory
118 : * mapped region. msync (FlushViewOfFile) does work, though. So let's
119 : * hope the VM paging algorithm behaves better than Linux which just
120 : * runs off the cliff and if MonetDB does not prevent RSS from being
121 : * too high, enters coma.
122 : *
123 : * We will only be able to sensibly test this on Windows64. On
124 : * Windows32, mmap sizes do not significantly exceed RAM sizes so
125 : * MonetDB swapping actually will not happen (of course, you've got
126 : * this nasty problem of VM fragmentation and failing mmaps instead).
127 : *
128 : * In principle, page tiles are saved sequentially, and behind it, but
129 : * never overtaking it, is an "unload-cursor" that frees the pages if
130 : * that is needed to keep RSS down. There is a tweak in the
131 : * algorithm, that re-sets the unload-cursor if it seems that all
132 : * tiles to the end have been saved (whether a tile is actually saved
133 : * is determined by timing the sync action). This means that the
134 : * producing operator is ready creating the BAT, and we assume it is
135 : * going to be used sequentially afterwards. In that case, we should
136 : * start unloading right after the 'read-cursor', that is, from the
137 : * start.
138 : *
139 : * EXAMPLE
140 : * D = dirty tile
141 : * s = saved tile (i.e. clean)
142 : * u = unloaded tile
143 : * L = tile that is being loaded
144 : *
145 : * +--> operator produces BAT
146 : * (1) DDDDDD|......................................| end of reserved mmap
147 : * ____|RSS
148 : * |
149 : * | at 3/4 of RSS consumed we start to worry
150 : * +--> operator produces BAT
151 : * (2) DDDDDDDDDDDDDDDD|............................|
152 : * s<----------------------------- VM backwards save thread
153 : * |
154 : * + first tile of which saving costs anything
155 : *
156 : * +--> operator produces BAT
157 : * (3) DDDDDDDDDDDDDDDss|D|.........................|
158 : * VM-thread save ->|
159 : *
160 : * When the RSS target is exceeded, we start unloading tiles..
161 : *
162 : * +--> VM-thread unload starts at *second* 's'
163 : * |
164 : * | +--> operator produces BAT
165 : * (4) DDDDDDDDDDDDDDDsus|DD|........................|
166 : * VM-thread save -->| | RSS = Full!
167 : *
168 : * +-- 0 => save costs nothing!!
169 : * VM-thread save ------------->| assume bat complete
170 : * (5) DDDDDDDDDDDDDDDsuuuuuuuuussss0................|
171 : * |<-------- re-set unload cursor
172 : * +--- first tile was not unloaded.
173 : *
174 : * later.. some other operator sequentially reads the bat
175 : * first part is 'D', that is, nicely cached.
176 : *
177 : * ---read------->|
178 : * (6) DDDDDDDDDDDDDDDsuuuuuuuuussss0................|
179 : *
180 : * now we're hitting the unloaded region. the query becomes
181 : * I/O read bound here (typically 20% CPU utilization).
182 : *
183 : * ---read-------->|
184 : * (7) DDDDDDDDDDDDDDDuLuuuuuuuussss0................|
185 : * / \
186 : * unload cursor load cursor
187 : *
188 : * ---read---------------->|
189 : * (8) DDDDDDDDDDDDDDDuuuuuuuuuLssss0................|
190 : * / \
191 : * unload cursor load cursor
192 : *
193 : * ---read--------------------->| done
194 : * (9) DDDDDDDDDDDDDDDuuuuuuuuuLssss0................|
195 : * ****
196 : * last part still cached
197 : *
198 : * note: if we had not reset the unload cursor (5)
199 : * the last part would have been lost due to continuing
200 : * RSS pressure from the 'L' read-cursor.
201 : *
202 : * If multiple write-mmaps exist, we do unload-tile and save-tile
203 : * selection on a round-robin basis among them.
204 : *
205 : * Of course, this is a simple solution for simple cases only.
206 : * (a) if the bat is produced too fast, (or your disk is too slow)
207 : * RSS will exceed its limit and Linux will go into swapping.
208 : * (b) if your data is not produced and read sequentially.
209 : * Examples are sorting or clustering on huge datasets.
210 : * (c) if RSS pressure is due to large read-maps, rather than
211 : * intermediate results.
212 : *
213 : * Two crude suggestions:
214 : * - If we are under RSS pressure without unloadable tiles and with
215 : * saveable tiles, we should consider suspending *all* other threads
216 : * until we manage to unload a tile.
217 : * - if there are no saveable tiles (or in case of read-only maps)
218 : * we could resort to saving and unloading random tiles.
219 : *
220 : * To do better, our BAT algorithms should provide even more detailed
221 : * advice on their access patterns, which may even consist of pointers
222 : * to the cursors (i.e. pointers to b->batBuns->free or the cursors
223 : * in radix-cluster), which an enhanced version of this thread might
224 : * take into account.
225 : *
226 : * [Kersten] The memory map table should be aligned to the number of
227 : * mapped files. In more recent applications, such as the SkyServer
228 : * this may be around 2000 BATs easily.
229 : */
230 :
231 : #ifdef HAVE_PTHREAD_H
232 : /* pthread.h on Windows includes config.h if HAVE_CONFIG_H is set */
233 : #undef HAVE_CONFIG_H
234 : #include <sched.h>
235 : #include <pthread.h>
236 : #endif
237 : #ifdef HAVE_SEMAPHORE_H
238 : #include <semaphore.h>
239 : #endif
240 :
241 : #ifndef NATIVE_WIN32
242 : #ifdef HAVE_POSIX_FADVISE
243 : #ifdef HAVE_UNAME
244 : #include <sys/utsname.h>
245 : #endif
246 : #endif
247 :
/* Platform initialisation hook.  The non-Windows build has nothing to
 * set up, so this is intentionally a no-op. */
void
MT_init_posix(void)
{
	/* nothing to do */
}
252 :
253 : /* return RSS in bytes */
254 : size_t
255 0 : MT_getrss(void)
256 : {
257 : #if defined(HAVE_PROCFS_H) && defined(__sun__)
258 : /* retrieve RSS the Solaris way (2.6+) */
259 : int fd;
260 : psinfo_t psbuff;
261 :
262 : fd = open("/proc/self/psinfo", O_RDONLY | O_CLOEXEC);
263 : if (fd >= 0) {
264 : if (read(fd, &psbuff, sizeof(psbuff)) == sizeof(psbuff)) {
265 : close(fd);
266 : return psbuff.pr_rssize * 1024;
267 : }
268 : close(fd);
269 : }
270 : #elif defined(HAVE_TASK_INFO)
271 : /* Darwin/MACH call for process' RSS */
272 : task_t task = mach_task_self();
273 : struct task_basic_info_64 t_info;
274 : mach_msg_type_number_t t_info_count = TASK_BASIC_INFO_64_COUNT;
275 :
276 : if (task_info(task, TASK_BASIC_INFO_64, (task_info_t)&t_info, &t_info_count) != KERN_INVALID_POLICY)
277 : return t_info.resident_size; /* bytes */
278 : #elif defined(HAVE_KVM_H)
279 : /* get RSS on FreeBSD and NetBSD */
280 : struct kinfo_proc *ki;
281 : int ski = 1;
282 : kvm_t *kd;
283 : size_t rss = 0;
284 :
285 : kd = kvm_open(NULL, "/dev/null", NULL, O_RDONLY, "kvm_open");
286 : if (kd != NULL) {
287 : ki = kvm_getprocs(kd, KERN_PROC_PID, getpid(), &ski);
288 : if (ki != NULL) {
289 : #ifdef __NetBSD__ /* should we use configure for this? */
290 : /* see bug 3217 */
291 : rss = ki->kp_eproc.e_vm.vm_rssize;
292 : #else
293 : rss = ki->ki_rssize;
294 : #endif
295 : kvm_close(kd);
296 :
297 : return rss * MT_pagesize();
298 : } else {
299 : kvm_close(kd);
300 : }
301 : }
302 : #elif defined(__linux__)
303 : /* get RSS on Linux */
304 0 : int fd;
305 :
306 0 : fd = open("/proc/self/statm", O_RDONLY | O_CLOEXEC);
307 0 : if (fd >= 0) {
308 0 : char buf[1024];
309 0 : ssize_t sz = read(fd, buf, sizeof(buf) - 1);
310 :
311 0 : close(fd);
312 0 : if (sz > 0) {
313 0 : buf[sz] = 0;
314 0 : long rss;
315 0 : if (sscanf(buf, "%*d %ld", &rss) >= 1)
316 0 : return (size_t) rss * MT_pagesize();
317 : }
318 : }
319 : #endif
320 : return 0;
321 : }
322 :
323 : void *
324 2585 : MT_mmap(const char *path, int mode, size_t len)
325 : {
326 2585 : int fd;
327 2585 : void *ret;
328 :
329 2585 : fd = open(path, O_CREAT | ((mode & MMAP_WRITE) ? O_RDWR : O_RDONLY) | O_CLOEXEC, MONETDB_MODE);
330 2593 : if (fd < 0) {
331 0 : GDKsyserror("open %s failed\n", path);
332 0 : return NULL;
333 : }
334 2593 : ret = mmap(NULL,
335 : len,
336 2593 : ((mode & MMAP_WRITABLE) ? PROT_WRITE : 0) | PROT_READ,
337 2593 : (mode & MMAP_COPY) ? (MAP_PRIVATE | MAP_NORESERVE) : MAP_SHARED,
338 : fd,
339 : 0);
340 2612 : if (ret == MAP_FAILED) {
341 0 : GDKsyserror("mmap(%s,%zu) failed\n", path, len);
342 0 : ret = NULL;
343 : } else {
344 2612 : VALGRIND_MALLOCLIKE_BLOCK(ret, len, 0, 1);
345 : }
346 2612 : close(fd);
347 2612 : return ret;
348 : }
349 :
/* Unmap len bytes starting at p.
 *
 * Returns the result of munmap(); a negative result is additionally
 * reported through GDKsyserror. */
int
MT_munmap(void *p, size_t len)
{
	const int rc = munmap(p, len);

	if (rc < 0) {
		GDKsyserror("munmap(%p,%zu) failed\n", p, len);
	}
	/* Valgrind is told the block is gone in either case */
	VALGRIND_FREELIKE_BLOCK(p, 0);
	return rc;
}
360 :
361 : /* expand or shrink a memory map (ala realloc).
362 : * the address returned may be different from the address going in.
363 : * in case of failure, the old address is still mapped and NULL is returned.
364 : */
365 : void *
366 453 : MT_mremap(const char *path, int mode, void *old_address, size_t old_size, size_t *new_size)
367 : {
368 453 : void *p;
369 453 : int fd = -1;
370 453 : int flags = mode & MMAP_COPY ? MAP_PRIVATE : MAP_SHARED;
371 453 : int prot = PROT_WRITE | PROT_READ;
372 :
373 : #ifdef MAP_FIXED_NOREPLACE
374 453 : flags |= MAP_FIXED_NOREPLACE;
375 : #endif
376 : /* round up to multiple of page size */
377 453 : *new_size = (*new_size + GDK_mmap_pagesize - 1) & ~(GDK_mmap_pagesize - 1);
378 :
379 : /* doesn't make sense for us to extend read-only memory map */
380 453 : assert(mode & MMAP_WRITABLE);
381 :
382 453 : if (*new_size < old_size) {
383 : #ifndef __COVERITY__ /* hide this from static code analyzer */
384 : /* shrink */
385 0 : VALGRIND_RESIZEINPLACE_BLOCK(old_address, old_size, *new_size, 0);
386 0 : if (munmap((char *) old_address + *new_size,
387 : old_size - *new_size) < 0) {
388 0 : GDKsyserror("MT_mremap(%s,%p,%zu,%zu): munmap() failed\n", path?path:"NULL", old_address, old_size, *new_size);
389 : /* even though the system call failed, we
390 : * don't need to propagate the error up: the
391 : * address should still work in the same way
392 : * as it did before */
393 0 : return old_address;
394 : }
395 0 : if (path && truncate(path, *new_size) < 0)
396 0 : GDKwarning("truncate of %s failed: %s\n",
397 : path, GDKstrerror(errno, (char[64]){0}, 64));
398 : #endif /* !__COVERITY__ */
399 0 : return old_address;
400 : }
401 453 : if (*new_size == old_size) {
402 : /* do nothing */
403 : return old_address;
404 : }
405 :
406 453 : if (!(mode & MMAP_COPY) && path != NULL) {
407 : /* "normal" memory map */
408 :
409 453 : if ((fd = open(path, O_RDWR | O_CLOEXEC)) < 0) {
410 0 : GDKsyserror("MT_mremap(%s,%p,%zu,%zu): open failed\n",
411 : path, old_address, old_size, *new_size);
412 0 : return NULL;
413 : }
414 453 : if (GDKextendf(fd, *new_size, path) != GDK_SUCCEED) {
415 0 : close(fd);
416 0 : TRC_ERROR(GDK, "MT_mremap(%s,%p,%zu,%zu): GDKextendf() failed\n", path, old_address, old_size, *new_size);
417 0 : return NULL;
418 : }
419 : #ifdef HAVE_MREMAP
420 : /* on Linux it's easy */
421 453 : p = mremap(old_address, old_size, *new_size, MREMAP_MAYMOVE);
422 : #ifdef HAVE_VALGRIND
423 : if (p != MAP_FAILED) {
424 : if (p == old_address) {
425 : VALGRIND_RESIZEINPLACE_BLOCK(old_address, old_size, *new_size, 0);
426 : } else {
427 : VALGRIND_FREELIKE_BLOCK(old_address, 0);
428 : VALGRIND_MALLOCLIKE_BLOCK(p, *new_size, 0, 1);
429 : }
430 : }
431 : #endif
432 : #else
433 : /* try to map extension at end of current map */
434 : p = mmap((char *) old_address + old_size, *new_size - old_size,
435 : prot, flags, fd, old_size);
436 : /* if it failed and MAP_FIXED_NOREPLACE is not defined,
437 : * there is no point trying a full mmap: that too won't
438 : * fit either (if MAP_FIXED_NOREPLACE, only relevant
439 : * failure is with EEXIST) */
440 : if (p != MAP_FAILED || errno == EEXIST) {
441 : if (p == (char *) old_address + old_size) {
442 : /* we got the requested address, make
443 : * sure we return the correct (old)
444 : * address */
445 : VALGRIND_RESIZEINPLACE_BLOCK(old_address, old_size, *new_size, 0);
446 : p = old_address;
447 : } else {
448 : /* we got some other address: discard
449 : * it and make full mmap */
450 : if (p != MAP_FAILED &&
451 : munmap(p, *new_size - old_size) < 0)
452 : GDKsyserror("munmap");
453 : #ifdef NO_MMAP_ALIASING
454 : if (msync(old_address, old_size, MS_SYNC) < 0)
455 : GDKsyserror("msync");
456 : #endif
457 : /* first create full mmap, then, if
458 : * successful, remove old mmap */
459 : #ifdef MAP_FIXED_NOREPLACE
460 : flags &= ~MAP_FIXED_NOREPLACE;
461 : #endif
462 : p = mmap(NULL, *new_size, prot, flags, fd, 0);
463 : if (p != MAP_FAILED) {
464 : VALGRIND_MALLOCLIKE_BLOCK(p, *new_size, 0, 1);
465 : if (munmap(old_address, old_size) < 0)
466 : GDKsyserror("munmap");
467 : VALGRIND_FREELIKE_BLOCK(old_address, 0);
468 : }
469 : }
470 : }
471 : #endif /* HAVE_MREMAP */
472 : } else {
473 : /* "copy-on-write" or "anonymous" memory map */
474 : #ifdef MAP_ANONYMOUS
475 0 : flags |= MAP_ANONYMOUS;
476 : #else
477 : if ((fd = open("/dev/zero", O_RDWR | O_CLOEXEC)) < 0) {
478 : GDKsyserror("MT_mremap(%s,%p,%zu,%zu): "
479 : "open('/dev/zero') failed\n",
480 : path ? path : "NULL", old_address,
481 : old_size, *new_size);
482 : return NULL;
483 : }
484 : #endif
485 : /* try to map an anonymous area as extent to the
486 : * current map */
487 0 : p = mmap((char *) old_address + old_size, *new_size - old_size,
488 : prot, flags, fd, 0);
489 : /* no point trying a full map if this didn't work:
490 : * there isn't enough space */
491 0 : if (p != MAP_FAILED || errno == EEXIST) {
492 0 : if (p == (char *) old_address + old_size) {
493 : /* we got the requested address, make
494 : * sure we return the correct (old)
495 : * address */
496 : VALGRIND_RESIZEINPLACE_BLOCK(old_address, old_size, *new_size, 0);
497 : p = old_address;
498 : } else {
499 : /* we got some other address: discard
500 : * it and make full mmap */
501 0 : if (p != MAP_FAILED &&
502 0 : munmap(p, *new_size - old_size) < 0)
503 0 : GDKsyserror("munmap");
504 : #ifdef MAP_FIXED_NOREPLACE
505 0 : flags &= ~MAP_FIXED_NOREPLACE;
506 : #endif
507 : #ifdef HAVE_MREMAP
508 : /* first get an area large enough for
509 : * *new_size */
510 0 : p = mmap(NULL, *new_size, prot, flags, fd, 0);
511 0 : if (p != MAP_FAILED) {
512 : /* then overlay old mmap over new */
513 0 : void *q;
514 :
515 0 : q = mremap(old_address, old_size,
516 : old_size,
517 : MREMAP_FIXED | MREMAP_MAYMOVE,
518 : p);
519 0 : assert(q == p || q == MAP_FAILED);
520 0 : if (q == MAP_FAILED) {
521 0 : int e = errno;
522 : /* we didn't expect this... */
523 0 : if (munmap(p, *new_size) < 0)
524 0 : GDKsyserror("munmap");
525 0 : p = MAP_FAILED;
526 0 : errno = e;
527 : }
528 : #ifdef HAVE_VALGRIND
529 : else {
530 : VALGRIND_FREELIKE_BLOCK(old_size, 0);
531 : VALGRIND_MALLOCLIKE_BLOCK(p, *new_size, 0, 1);
532 : }
533 : #endif
534 : }
535 : #else
536 : p = MAP_FAILED;
537 : if (path == NULL ||
538 : *new_size <= GDK_mmap_minsize_persistent) {
539 : /* size not too big yet or
540 : * anonymous, try to make new
541 : * anonymous mmap and copy
542 : * data over */
543 : p = mmap(NULL, *new_size, prot, flags,
544 : fd, 0);
545 : if (p != MAP_FAILED) {
546 : VALGRIND_MALLOCLIKE_BLOCK(p, *new_size, 0, 0);
547 : memcpy(p, old_address,
548 : old_size);
549 : munmap(old_address, old_size);
550 : VALGRIND_FREELIKE_BLOCK(old_address, 0);
551 : }
552 : /* if it failed, try alternative */
553 : }
554 : if (p == MAP_FAILED && path != NULL) {
555 : /* write data to disk, then
556 : * mmap it to new address */
557 : if (fd >= 0)
558 : close(fd);
559 : fd = -1;
560 : p = malloc(strlen(path) + 5);
561 : if (p == NULL){
562 : GDKsyserror("MT_mremap(%s,%p,%zu,%zu): fd < 0\n", path, old_address, old_size, *new_size);
563 : return NULL;
564 : }
565 :
566 : stpcpy(stpcpy(p, path), ".tmp");
567 : fd = open(p, O_RDWR | O_CREAT | O_CLOEXEC,
568 : MONETDB_MODE);
569 : if (fd < 0) {
570 : GDKsyserror("MT_mremap(%s,%p,%zu,%zu): fd < 0\n", path, old_address, old_size, *new_size);
571 : free(p);
572 : return NULL;
573 : }
574 : free(p);
575 : if (write(fd, old_address,
576 : old_size) < 0 ||
577 : #ifdef HAVE_FALLOCATE
578 : /* prefer Linux-specific
579 : * fallocate over standard
580 : * posix_fallocate, since
581 : * glibc uses a rather
582 : * slow method of
583 : * allocating the file if
584 : * the file system doesn't
585 : * support the operation,
586 : * we just use ftruncate
587 : * in that case */
588 : (fallocate(fd, 0, (off_t) old_size, (off_t) *new_size - (off_t) old_size) < 0 && (errno != EOPNOTSUPP || ftruncate(fd, (off_t) *new_size) < 0))
589 : #else
590 : #ifdef HAVE_POSIX_FALLOCATE
591 : /* posix_fallocate returns
592 : * error number on
593 : * failure, not -1, and if
594 : * it returns EINVAL, the
595 : * underlying file system
596 : * may not support the
597 : * operation, so we then
598 : * need to try
599 : * ftruncate */
600 : ((errno = posix_fallocate(fd, (off_t) old_size, (off_t) *new_size - (off_t) old_size)) == EINVAL ? ftruncate(fd, (off_t) *new_size) < 0 : errno != 0)
601 : #else
602 : ftruncate(fd, (off_t) *new_size) < 0
603 : #endif
604 : #endif
605 : ) {
606 : GDKsyserror("MT_mremap(%s,%p,%zu,%zu): write() or "
607 : #ifdef HAVE_FALLOCATE
608 : "fallocate()"
609 : #else
610 : #ifdef HAVE_POSIX_FALLOCATE
611 : "posix_fallocate()"
612 : #else
613 : "ftruncate()"
614 : #endif
615 : #endif
616 : " failed\n", path, old_address, old_size, *new_size);
617 : /* extending failed:
618 : * free any disk space
619 : * allocated in the
620 : * process */
621 : if (ftruncate(fd, (off_t) old_size) < 0)
622 : GDKsyserror("MT_mremap(%s,%p,%zu,%zu): ftruncate() failed\n", path, old_address, old_size, *new_size);
623 : close(fd);
624 : return NULL;
625 : }
626 : p = mmap(NULL, *new_size, prot, flags,
627 : fd, 0);
628 : if (p != MAP_FAILED) {
629 : VALGRIND_MALLOCLIKE_BLOCK(p, *new_size, 0, 1);
630 : munmap(old_address, old_size);
631 : VALGRIND_FREELIKE_BLOCK(old_address, 0);
632 : }
633 : }
634 : #endif /* HAVE_MREMAP */
635 : }
636 : }
637 : }
638 453 : if (p == MAP_FAILED)
639 0 : GDKsyserror("MT_mremap(%s,%p,%zu,%zu): p == MAP_FAILED\n", path?path:"NULL", old_address, old_size, *new_size);
640 453 : if (fd >= 0)
641 453 : close(fd);
642 453 : return p == MAP_FAILED ? NULL : p;
643 : }
644 :
/* Synchronously (MS_SYNC) write the len bytes of the memory map at p
 * back to their file.
 *
 * Returns the result of msync(); a negative result is additionally
 * reported through GDKsyserror. */
int
MT_msync(void *p, size_t len)
{
	const int rc = msync(p, len, MS_SYNC);

	if (rc >= 0)
		return rc;
	GDKsyserror("msync failed\n");
	return rc;
}
654 :
655 : bool
656 27068722 : MT_path_absolute(const char *pathname)
657 : {
658 27068722 : return (*pathname == DIR_SEP);
659 : }
660 :
661 : #ifdef HAVE_DLFCN_H
662 : # include <dlfcn.h>
663 : #endif
664 :
/* Return a dlopen() handle for the main program.
 *
 * The library argument is deliberately ignored (MacOS does not handle
 * dlopen on a linked library), so dlopen() is always called with a
 * NULL file name; mode is passed through unchanged. */
void *
mdlopen(const char *library, int mode)
{
	void *handle;

	(void) library;		/* not used, see above */
	handle = dlopen(NULL, mode);
	return handle;
}
671 :
672 : #else /* WIN32 native */
673 :
674 : #ifndef BUFSIZ
675 : #define BUFSIZ 1024
676 : #endif
677 :
678 : #undef _errno
679 :
680 : #include <windows.h>
681 :
682 : #ifdef _MSC_VER
683 : #include <io.h>
684 : #endif /* _MSC_VER */
685 : #include <Psapi.h>
686 :
687 : #define MT_SMALLBLOCK 256
688 :
689 : static LONG WINAPI
690 : MT_ignore_exceptions(struct _EXCEPTION_POINTERS *ExceptionInfo)
691 : {
692 : (void) ExceptionInfo;
693 : return EXCEPTION_EXECUTE_HANDLER;
694 : }
695 :
696 : void
697 : MT_init_posix(void)
698 : {
699 : SetUnhandledExceptionFilter(MT_ignore_exceptions);
700 : }
701 :
702 : size_t
703 : MT_getrss(void)
704 : {
705 : PROCESS_MEMORY_COUNTERS ctr;
706 : if (GetProcessMemoryInfo(GetCurrentProcess(), &ctr, sizeof(ctr)))
707 : return ctr.WorkingSetSize;
708 : return 0;
709 : }
710 :
711 : /* Windows mmap keeps a global list of base addresses for complex
712 : * (remapped) memory maps; the reason is that each remapped segment
713 : * needs to be unmapped separately in the end. */
714 :
715 : void *
716 : MT_mmap(const char *path, int mode, size_t len)
717 : {
718 : DWORD mode0 = FILE_READ_ATTRIBUTES | FILE_READ_DATA;
719 : DWORD mode1 = FILE_SHARE_READ | FILE_SHARE_WRITE;
720 : DWORD mode2 = mode & MMAP_ADVISE;
721 : DWORD mode3 = PAGE_READONLY;
722 : int mode4 = FILE_MAP_READ;
723 : SECURITY_ATTRIBUTES sa;
724 : HANDLE h1, h2;
725 : void *ret;
726 : wchar_t *wpath = utf8toutf16(path);
727 : if (wpath == NULL)
728 : return NULL;
729 :
730 : static_assert(SIZEOF_WCHAR_T == 2, "wchar_t on Windows expected to be 2 bytes");
731 : if (mode & MMAP_WRITE) {
732 : mode0 |= FILE_APPEND_DATA | FILE_WRITE_ATTRIBUTES | FILE_WRITE_DATA;
733 : }
734 : if (mode2 == MMAP_RANDOM || mode2 == MMAP_DONTNEED) {
735 : mode2 = FILE_FLAG_RANDOM_ACCESS;
736 : } else if (mode2 == MMAP_SEQUENTIAL || mode2 == MMAP_WILLNEED) {
737 : mode2 = FILE_FLAG_SEQUENTIAL_SCAN;
738 : } else {
739 : mode2 = FILE_FLAG_NO_BUFFERING;
740 : }
741 : if (mode & MMAP_SYNC) {
742 : mode2 |= FILE_FLAG_WRITE_THROUGH;
743 : }
744 : if (mode & MMAP_COPY) {
745 : mode3 = PAGE_WRITECOPY;
746 : mode4 = FILE_MAP_COPY;
747 : } else if (mode & MMAP_WRITE) {
748 : mode3 = PAGE_READWRITE;
749 : mode4 = FILE_MAP_WRITE;
750 : }
751 : mode2 |= FILE_ATTRIBUTE_NOT_CONTENT_INDEXED;
752 : sa.nLength = sizeof(SECURITY_ATTRIBUTES);
753 : sa.bInheritHandle = TRUE;
754 : sa.lpSecurityDescriptor = 0;
755 :
756 : h1 = CreateFileW(wpath, mode0, mode1, &sa, OPEN_ALWAYS, mode2, NULL);
757 : if (h1 == INVALID_HANDLE_VALUE) {
758 : (void) SetFileAttributesW(wpath, FILE_ATTRIBUTE_NORMAL);
759 : h1 = CreateFileW(wpath, mode0, mode1, &sa, OPEN_ALWAYS, mode2, NULL);
760 : if (h1 == INVALID_HANDLE_VALUE) {
761 : free(wpath);
762 : GDKwinerror("CreateFile('%s', %lu, %lu, &sa, %lu, %lu, NULL) failed\n",
763 : path, (unsigned long) mode0, (unsigned long) mode1, (unsigned long) OPEN_ALWAYS, (unsigned long) mode2);
764 : return NULL;
765 : }
766 : }
767 : free(wpath);
768 :
769 : h2 = CreateFileMapping(h1, &sa, mode3, (DWORD) (((__int64) len >> 32) & LL_CONSTANT(0xFFFFFFFF)), (DWORD) (len & LL_CONSTANT(0xFFFFFFFF)), NULL);
770 : if (h2 == NULL) {
771 : GDKwinerror("CreateFileMapping(%p, &sa, %lu, %lu, %lu, NULL) failed\n",
772 : h1, (unsigned long) mode3,
773 : (unsigned long) (((unsigned __int64) len >> 32) & LL_CONSTANT(0xFFFFFFFF)),
774 : (unsigned long) (len & LL_CONSTANT(0xFFFFFFFF)));
775 : CloseHandle(h1);
776 : return NULL;
777 : }
778 : CloseHandle(h1);
779 :
780 : ret = MapViewOfFileEx(h2, mode4, (DWORD) 0, (DWORD) 0, len, NULL);
781 : if (ret == NULL)
782 : errno = winerror(GetLastError());
783 : CloseHandle(h2);
784 :
785 : return ret;
786 : }
787 :
/* Unmap the view starting at p; the size argument is not needed on
 * Windows and is ignored.
 *
 * UnmapViewOfFile reports success as non-zero and failure as 0; this
 * is translated to the munmap convention of 0 on success and -1 on
 * failure.  A failure is reported through GDKwinerror. */
int
MT_munmap(void *p, size_t dummy)
{
	(void) dummy;

	if (UnmapViewOfFile(p))
		return 0;
	GDKwinerror("UnmapViewOfFile failed\n");
	return -1;
}
803 :
804 : void *
805 : MT_mremap(const char *path, int mode, void *old_address, size_t old_size, size_t *new_size)
806 : {
807 : void *p;
808 :
809 : /* doesn't make sense for us to extend read-only memory map */
810 : assert(mode & MMAP_WRITABLE);
811 :
812 : /* round up to multiple of page size */
813 : *new_size = (*new_size + GDK_mmap_pagesize - 1) & ~(GDK_mmap_pagesize - 1);
814 :
815 : if (old_size >= *new_size) {
816 : *new_size = old_size;
817 : return old_address; /* don't bother shrinking */
818 : }
819 : if (GDKextend(path, *new_size) != GDK_SUCCEED) {
820 : TRC_ERROR(GDK, "MT_mremap(%s,%p,%zu,%zu): GDKextend() failed\n", path?path:"NULL", old_address, old_size, *new_size);
821 : return NULL;
822 : }
823 : if (path && !(mode & MMAP_COPY))
824 : MT_munmap(old_address, old_size);
825 : p = MT_mmap(path, mode, *new_size);
826 : if (p != NULL && (path == NULL || (mode & MMAP_COPY))) {
827 : memcpy(p, old_address, old_size);
828 : MT_munmap(old_address, old_size);
829 : }
830 :
831 : if (p == NULL)
832 : TRC_ERROR(GDK, "MT_mremap(%s,%p,%zu,%zu): p == NULL\n", path?path:"NULL", old_address, old_size, *new_size);
833 : return p;
834 : }
835 :
/* Flush the len bytes of the mapped view at p with FlushViewOfFile.
 *
 * FlushViewOfFile reports success as non-zero and failure as 0; this
 * is translated to the msync convention of 0 on success and -1 on
 * failure.  A failure is reported through GDKwinerror. */
int
MT_msync(void *p, size_t len)
{
	if (FlushViewOfFile(p, len))
		return 0;
	GDKwinerror("FlushViewOfFile failed\n");
	return -1;
}
850 :
/* Tell whether pathname is an absolute Windows path.
 *
 * A path counts as absolute when it starts with a backslash, or with
 * an ASCII drive letter followed by a colon and a directory separator
 * (slash or backslash).  pathname must not be NULL. */
bool
MT_path_absolute(const char *pathname)
{
	const char first = pathname[0];

	/* any leading backslash will do, a second one is not required */
	if (first == '\\')
		return true;

	/* drive letter, colon, directory separator */
	if (!(('a' <= first && first <= 'z') ||
	      ('A' <= first && first <= 'Z')))
		return false;
	if (pathname[1] != ':')
		return false;
	return pathname[2] == '/' || pathname[2] == '\\';
}
861 :
862 : #ifndef HAVE_GETTIMEOFDAY
863 : static int nodays[12] = { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 };
864 :
865 : #define LEAPYEAR(y) ((((y)%4)==0 && ((y)%100)!=0) || ((y)%400)==0)
866 : #define NODAYS(m,y) (((m)!=2)?nodays[(m)-1]:LEAPYEAR(y)?29:28)
867 :
868 : int
869 : gettimeofday(struct timeval *tv, int *ignore_zone)
870 : {
871 : unsigned int year, day, month;
872 : SYSTEMTIME st;
873 :
874 : (void) ignore_zone;
875 : GetSystemTime(&st);
876 : day = 0;
877 : for (year = 1970; year < st.wYear; year++)
878 : day += LEAPYEAR(year) ? 366 : 365;
879 :
880 : for (month = 1; month < st.wMonth; month++)
881 : day += NODAYS(month, st.wYear);
882 :
883 : day += st.wDay;
884 : tv->tv_sec = 60 * (day * 24 * 60 + st.wMinute) + st.wSecond;
885 : tv->tv_usec = 1000 * st.wMilliseconds;
886 : return 0;
887 : }
888 : #endif
889 :
/* Thin wrapper: open the library named by file by handing both
 * arguments unchanged to dlopen(), and return its result. */
void *
mdlopen(const char *file, int mode)
{
	void *handle = dlopen(file, mode);

	return handle;
}
895 :
/* Emulation of dlopen() for Windows.
 *
 * With a NULL file the handle of the running executable is returned
 * (GetModuleHandle(NULL)).  Otherwise the UTF-8 name is converted to
 * UTF-16 and loaded with LoadLibraryW; NULL is returned if the
 * conversion fails.  The mode argument is ignored. */
void *
dlopen(const char *file, int mode)
{
	(void) mode;

	if (file == NULL)
		return GetModuleHandle(NULL);

	wchar_t *wfile = utf8toutf16(file);
	if (wfile == NULL)
		return NULL;

	void *module = LoadLibraryW(wfile);
	free(wfile);
	return module;
}
910 :
911 : int
912 : dlclose(void *handle)
913 : {
914 : if (handle != NULL) {
915 : return FreeLibrary((HINSTANCE) handle);
916 : }
917 : return -1;
918 : }
919 :
920 : void *
921 : dlsym(void *handle, const char *name)
922 : {
923 : if (handle != NULL) {
924 : return (void *) GetProcAddress((HINSTANCE) handle, name);
925 : }
926 : return NULL;
927 : }
928 :
929 : char *
930 : dlerror(void)
931 : {
932 : static char msg[1024];
933 :
934 : FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM, NULL, GetLastError(), 0, msg, sizeof(msg), NULL);
935 : return msg;
936 : }
937 : #endif
938 :
/* Suspend the calling thread for about ms milliseconds.
 *
 * Uses Sleep() on Windows, nanosleep() where available and select()
 * with an empty descriptor set otherwise.  On the non-Windows paths a
 * request of exactly 1 ms is shortened to 1 microsecond.
 * NOTE(review): presumably so that MT_sleep_ms(1) acts as a cheap
 * yield -- confirm with the callers.
 * The result of the system call is ignored, so an interrupted sleep
 * may return early. */
void
MT_sleep_ms(unsigned int ms)
{
#if defined(NATIVE_WIN32)
	Sleep(ms);
#elif defined(HAVE_NANOSLEEP)
	struct timespec delay = {
		.tv_sec = ms / 1000,
		.tv_nsec = (long) (ms % 1000) * 1000000,
	};

	if (ms == 1)
		delay.tv_nsec = 1000;
	(void) nanosleep(&delay, NULL);
#else
	struct timeval delay = {
		.tv_sec = ms / 1000,
		.tv_usec = (ms % 1000) * 1000,
	};

	if (ms == 1)
		delay.tv_usec = 1;
	(void) select(0, NULL, NULL, NULL, &delay);
#endif
}
956 :
957 : #if !defined(HAVE_LOCALTIME_R) || !defined(HAVE_GMTIME_R) || !defined(HAVE_ASCTIME_R) || !defined(HAVE_CTIME_R)
958 : static MT_Lock timelock = MT_LOCK_INITIALIZER(timelock);
959 : #endif
960 :
961 : #ifndef HAVE_LOCALTIME_R
962 : struct tm *
963 : localtime_r(const time_t *restrict timep, struct tm *restrict result)
964 : {
965 : struct tm *tmp;
966 : MT_lock_set(&timelock);
967 : tmp = localtime(timep);
968 : if (tmp)
969 : *result = *tmp;
970 : MT_lock_unset(&timelock);
971 : return tmp ? result : NULL;
972 : }
973 : #endif
974 :
975 : #ifndef HAVE_GMTIME_R
976 : struct tm *
977 : gmtime_r(const time_t *restrict timep, struct tm *restrict result)
978 : {
979 : struct tm *tmp;
980 : MT_lock_set(&timelock);
981 : tmp = gmtime(timep);
982 : if (tmp)
983 : *result = *tmp;
984 : MT_lock_unset(&timelock);
985 : return tmp ? result : NULL;
986 : }
987 : #endif
988 :
989 : #ifndef HAVE_ASCTIME_R
990 : char *
991 : asctime_r(const struct tm *restrict tm, char *restrict buf)
992 : {
993 : char *tmp;
994 : MT_lock_set(&timelock);
995 : tmp = asctime(tm);
996 : if (tmp)
997 : strcpy(buf, tmp);
998 : MT_lock_unset(&timelock);
999 : return tmp ? buf : NULL;
1000 : }
1001 : #endif
1002 :
1003 : #ifndef HAVE_CTIME_R
1004 : char *
1005 : ctime_r(const time_t *restrict t, char *restrict buf)
1006 : {
1007 : char *tmp;
1008 : MT_lock_set(&timelock);
1009 : tmp = ctime(t);
1010 : if (tmp)
1011 : strcpy(buf, tmp);
1012 : MT_lock_unset(&timelock);
1013 : return tmp ? buf : NULL;
1014 : }
1015 : #endif
1016 :
1017 : #if !defined(HAVE_STRERROR_R) && !defined(HAVE_STRERROR_S)
1018 : static MT_Lock strerrlock = MT_LOCK_INITIALIZER(strerrlock);
1019 :
1020 : int
1021 : strerror_r(int errnum, char *buf, size_t buflen)
1022 : {
1023 : char *msg;
1024 : MT_lock_set(&strerrlock);
1025 : msg = strerror(errnum);
1026 : strcpy_len(buf, msg, buflen);
1027 : MT_lock_unset(&strerrlock);
1028 : return 0;
1029 : }
1030 : #endif
|