Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : /*
14 : * @a Niels Nes, Peter Boncz
15 : * @* System Independent Layer
16 : *
17 : * GDK is built on Posix. Exceptions are made for memory mapped files
18 : * and anonymous virtual memory, for which somewhat higher-level
19 : * functions are defined here. Most of this file concerns itself with
20 : * emulation of Posix functionality on the WIN32 native platform.
21 : */
22 : #include "monetdb_config.h"
23 : #include "gdk.h" /* includes gdk_posix.h */
24 : #include "gdk_private.h"
25 : #include "mutils.h"
26 : #include <unistd.h>
27 : #include <string.h> /* strncpy */
28 :
29 : #ifdef HAVE_FCNTL_H
30 : # include <fcntl.h>
31 : #endif
32 : #ifdef HAVE_PROCFS_H
33 : # include <procfs.h>
34 : #endif
35 : #ifdef HAVE_MACH_TASK_H
36 : # include <mach/task.h>
37 : #endif
38 : #ifdef HAVE_MACH_MACH_INIT_H
39 : # include <mach/mach_init.h>
40 : #endif
41 : #if defined(HAVE_KVM_H)
42 : # include <kvm.h>
43 : # include <sys/param.h>
44 : # include <sys/sysctl.h>
45 : # include <sys/user.h>
46 : #endif
47 :
48 : #if defined(__GNUC__) && defined(HAVE_VALGRIND)
49 : #include <valgrind.h>
50 : #else
51 : #define VALGRIND_MALLOCLIKE_BLOCK(addr, sizeB, rzB, is_zeroed)
52 : #define VALGRIND_FREELIKE_BLOCK(addr, rzB)
53 : #define VALGRIND_RESIZEINPLACE_BLOCK(addr, oldSizeB, newSizeB, rzB)
54 : #endif
55 :
56 : #ifndef MAP_NORESERVE
57 : # define MAP_NORESERVE MAP_PRIVATE
58 : #endif
59 : #if defined(MAP_ANON) && !defined(MAP_ANONYMOUS)
60 : #define MAP_ANONYMOUS MAP_ANON
61 : #endif
62 :
63 : #define MMAP_ADVISE 7
64 : #define MMAP_WRITABLE (MMAP_WRITE|MMAP_COPY)
65 :
66 : #ifndef O_CLOEXEC
67 : #ifdef _O_NOINHERIT
68 : #define O_CLOEXEC _O_NOINHERIT /* Windows */
69 : #else
70 : #define O_CLOEXEC 0
71 : #endif
72 : #endif
73 :
74 : /* Crude VM buffer management that keeps a list of all memory mapped
75 : * regions.
76 : *
77 : * a.k.a. "helping stupid VM implementations that ignore VM advice"
78 : *
79 : * The main goal is to be able to tell the OS to please stop buffering
80 : * all memory mapped pages when under pressure. A major problem is
81 : * materialization of large results in newly created memory mapped
82 : * files. Operating systems tend to cache all dirty pages, such that
83 : * when memory is out, all pages are dirty and cannot be unloaded
84 : * quickly. The VM panic occurs and comatose OS states may be
85 : * observed. This is in spite of our use of
86 : * madvise(MADV_SEQUENTIAL). That is; we would want that the OS drops
87 : * pages after we've passed them. That does not happen; pages are
88 : * retained and pollute the buffer cache.
89 : *
90 : * Regrettably, at this level, we don't know anything about how Monet
91 : * is using the mmapped regions. Monet code is totally oblivious of
92 : * any I/O; that's why it is so easy to create CPU efficient code in
93 : * Monet.
94 : *
95 : * The current solution focuses on large writable maps. These often
96 : * represent newly created BATs, that are the result of some (running)
97 : * operator. We assume two things here:
98 : * - the BAT is created in sequential fashion (always almost true)
99 : * - afterwards, this BAT is used in sequential fashion (often true)
100 : *
101 : * A VMtrim thread keeps an eye on the RSS (memory pressure) and large
102 : * writable memory maps. If RSS approaches mem_maxsize(), it starts to
103 : * *worry*, and starts to write dirty data from these writable maps to
104 : * disk in 128MB tiles. So, if memory pressure rises further in the
105 : * near future, the OS has some option to release memory pages cheaply
106 : * (i.e. without needing I/O). This is also done explicitly by the
107 : * VM-thread: when RSS exceeds mem_maxsize() it explicitly asks the OS
108 : * to release pages. The reason is that Linux is not smart enough to
109 : * do even this. Anyway..
110 : *
111 : * The way to free pages explicitly in Linux is to call
112 : * posix_fadvise(..,MADV_DONTNEED). Particularly,
113 : * posix_madvise(..,POSIX_MADV_DONTNEED) which is supported and
114 : * documented doesn't work on Linux. But we do both posix_madvise and
115 : * posix_fadvise, so on other unix systems that don't support
116 : * posix_fadvise, posix_madvise still might work. On Windows, to our
117 : * knowledge, there is no way to tell it stop buffering a memory
118 : * mapped region. msync (FlushViewOfFile) does work, though. So let's
119 : * hope the VM paging algorithm behaves better than Linux which just
120 : * runs off the cliff and if MonetDB does not prevent RSS from being
121 : * too high, enters coma.
122 : *
123 : * We will only be able to sensibly test this on Windows64. On
124 : * Windows32, mmap sizes do not significantly exceed RAM sizes so
125 : * MonetDB swapping actually will not happen (of course, you've got
126 : * this nasty problem of VM fragmentation and failing mmaps instead).
127 : *
128 : * In principle, page tiles are saved sequentially, and behind it, but
129 : * never overtaking it, is an "unload-cursor" that frees the pages if
130 : * that is needed to keep RSS down. There is a tweak in the
131 : * algorithm, that re-sets the unload-cursor if it seems that all
132 : * tiles to the end have been saved (whether a tile is actually saved
133 : * is determined by timing the sync action). This means that the
134 : * producing operator is ready creating the BAT, and we assume it is
135 : * going to be used sequentially afterwards. In that case, we should
136 : * start unloading right after the 'read-cursor', that is, from the
137 : * start.
138 : *
139 : * EXAMPLE
140 : * D = dirty tile
141 : * s = saved tile (i.e. clean)
142 : * u = unloaded tile
143 : * L = tile that is being loaded
144 : *
145 : * +--> operator produces BAT
146 : * (1) DDDDDD|......................................| end of reserved mmap
147 : * ____|RSS
148 : * |
149 : * | at 3/4 of RSS consumed we start to worry
150 : * +--> operator produces BAT
151 : * (2) DDDDDDDDDDDDDDDD|............................|
152 : * s<----------------------------- VM backwards save thread
153 : * |
154 : * + first tile of which saving costs anything
155 : *
156 : * +--> operator produces BAT
157 : * (3) DDDDDDDDDDDDDDDss|D|.........................|
158 : * VM-thread save ->|
159 : *
160 : * When the RSS target is exceeded, we start unloading tiles..
161 : *
162 : * +--> VM-thread unload starts at *second* 's'
163 : * |
164 : * | +--> operator produces BAT
165 : * (4) DDDDDDDDDDDDDDDsus|DD|........................|
166 : * VM-thread save -->| | RSS = Full!
167 : *
168 : * +-- 0 => save costs nothing!!
169 : * VM-thread save ------------->| assume bat complete
170 : * (5) DDDDDDDDDDDDDDDsuuuuuuuuussss0................|
171 : * |<-------- re-set unload cursor
172 : * +--- first tile was not unloaded.
173 : *
174 : * later.. some other operator sequentially reads the bat
175 : * first part is 'D', that is, nicely cached.
176 : *
177 : * ---read------->|
178 : * (6) DDDDDDDDDDDDDDDsuuuuuuuuussss0................|
179 : *
180 : * now we're hitting the unloaded region. the query becomes
181 : * I/O read bound here (typically 20% CPU utilization).
182 : *
183 : * ---read-------->|
184 : * (7) DDDDDDDDDDDDDDDuLuuuuuuuussss0................|
185 : * / \
186 : * unload cursor load cursor
187 : *
188 : * ---read---------------->|
189 : * (8) DDDDDDDDDDDDDDDuuuuuuuuuLssss0................|
190 : * / \
191 : * unload cursor load cursor
192 : *
193 : * ---read--------------------->| done
194 : * (9) DDDDDDDDDDDDDDDuuuuuuuuuLssss0................|
195 : * ****
196 : * last part still cached
197 : *
198 : * note: if we had not reset the unload cursor (5)
199 : * the last part would have been lost due to continuing
200 : * RSS pressure from the 'L' read-cursor.
201 : *
202 : * If multiple write-mmaps exist, we do unload-tile and save-tile
203 : * selection on a round-robin basis among them.
204 : *
205 : * Of course, this is a simple solution for simple cases only.
206 : * (a) if the bat is produced too fast, (or your disk is too slow)
207 : * RSS will exceed its limit and Linux will go into swapping.
208 : * (b) if your data is not produced and read sequentially.
209 : * Examples are sorting or clustering on huge datasets.
210 : * (c) if RSS pressure is due to large read-maps, rather than
211 : * intermediate results.
212 : *
213 : * Two crude suggestions:
214 : * - If we are under RSS pressure without unloadable tiles and with
215 : * savable tiles, we should consider suspending *all* other threads
216 : * until we manage to unload a tile.
217 : * - if there are no savable tiles (or in case of read-only maps)
218 : * we could resort to saving and unloading random tiles.
219 : *
220 : * To do better, our BAT algorithms should provide even more detailed
221 : * advice on their access patterns, which may even consist of pointers
222 : * to the cursors (i.e. pointers to b->batBuns->free or the cursors
223 : * in radix-cluster), which an enhanced version of this thread might
224 : * take into account.
225 : *
226 : * [Kersten] The memory map table should be aligned to the number of
227 : * mapped files. In more recent applications, such as the SkyServer
228 : * this may be around 2000 BATs easily.
229 : */
230 :
231 : #ifdef HAVE_PTHREAD_H
232 : /* pthread.h on Windows includes config.h if HAVE_CONFIG_H is set */
233 : #undef HAVE_CONFIG_H
234 : #include <sched.h>
235 : #include <pthread.h>
236 : #endif
237 : #ifdef HAVE_SEMAPHORE_H
238 : #include <semaphore.h>
239 : #endif
240 :
241 : #ifndef NATIVE_WIN32
242 : #ifdef HAVE_POSIX_FADVISE
243 : #ifdef HAVE_UNAME
244 : #include <sys/utsname.h>
245 : #endif
246 : #endif
247 :
/* Platform initialization hook.  In this (non-Windows) build of the
 * file there is nothing to set up, so the function is a no-op. */
void
MT_init_posix(void)
{
	/* intentionally empty */
}
252 :
253 : /* return RSS in bytes */
254 : size_t
255 0 : MT_getrss(void)
256 : {
257 : #if defined(HAVE_PROCFS_H) && defined(__sun__)
258 : /* retrieve RSS the Solaris way (2.6+) */
259 : int fd;
260 : psinfo_t psbuff;
261 :
262 : fd = open("/proc/self/psinfo", O_RDONLY | O_CLOEXEC);
263 : if (fd >= 0) {
264 : if (read(fd, &psbuff, sizeof(psbuff)) == sizeof(psbuff)) {
265 : close(fd);
266 : return psbuff.pr_rssize * 1024;
267 : }
268 : close(fd);
269 : }
270 : #elif defined(HAVE_TASK_INFO)
271 : /* Darwin/MACH call for process' RSS */
272 : task_t task = mach_task_self();
273 : struct task_basic_info_64 t_info;
274 : mach_msg_type_number_t t_info_count = TASK_BASIC_INFO_64_COUNT;
275 :
276 : if (task_info(task, TASK_BASIC_INFO_64, (task_info_t)&t_info, &t_info_count) != KERN_INVALID_POLICY)
277 : return t_info.resident_size; /* bytes */
278 : #elif defined(HAVE_KVM_H)
279 : /* get RSS on FreeBSD and NetBSD */
280 : struct kinfo_proc *ki;
281 : int ski = 1;
282 : kvm_t *kd;
283 : size_t rss = 0;
284 :
285 : kd = kvm_open(NULL, "/dev/null", NULL, O_RDONLY, "kvm_open");
286 : if (kd != NULL) {
287 : ki = kvm_getprocs(kd, KERN_PROC_PID, getpid(), &ski);
288 : if (ki != NULL) {
289 : #ifdef __NetBSD__ /* should we use configure for this? */
290 : /* see bug 3217 */
291 : rss = ki->kp_eproc.e_vm.vm_rssize;
292 : #else
293 : rss = ki->ki_rssize;
294 : #endif
295 : kvm_close(kd);
296 :
297 : return rss * MT_pagesize();
298 : } else {
299 : kvm_close(kd);
300 : }
301 : }
302 : #elif defined(__linux__)
303 : /* get RSS on Linux */
304 0 : int fd;
305 :
306 0 : fd = open("/proc/self/statm", O_RDONLY | O_CLOEXEC);
307 0 : if (fd >= 0) {
308 0 : char buf[1024];
309 0 : ssize_t sz = read(fd, buf, sizeof(buf) - 1);
310 :
311 0 : close(fd);
312 0 : if (sz > 0) {
313 0 : buf[sz] = 0;
314 0 : long rss;
315 0 : if (sscanf(buf, "%*d %ld", &rss) >= 1)
316 0 : return (size_t) rss * MT_pagesize();
317 : }
318 : }
319 : #endif
320 : return 0;
321 : }
322 :
323 : void *
324 2087 : MT_mmap(const char *path, int mode, size_t len)
325 : {
326 2087 : int fd;
327 2087 : void *ret;
328 :
329 2087 : fd = open(path, O_CREAT | ((mode & MMAP_WRITE) ? O_RDWR : O_RDONLY) | O_CLOEXEC, MONETDB_MODE);
330 2098 : if (fd < 0) {
331 0 : GDKsyserror("open %s failed\n", path);
332 0 : return NULL;
333 : }
334 2098 : ret = mmap(NULL,
335 : len,
336 2098 : ((mode & MMAP_WRITABLE) ? PROT_WRITE : 0) | PROT_READ,
337 2098 : (mode & MMAP_COPY) ? (MAP_PRIVATE | MAP_NORESERVE) : MAP_SHARED,
338 : fd,
339 : 0);
340 2114 : if (ret == MAP_FAILED) {
341 0 : GDKsyserror("mmap(%s,%zu) failed\n", path, len);
342 0 : ret = NULL;
343 : } else {
344 2114 : VALGRIND_MALLOCLIKE_BLOCK(ret, len, 0, 1);
345 : }
346 2114 : close(fd);
347 2114 : return ret;
348 : }
349 :
/* Unmap len bytes starting at p.
 *
 * Returns the result of munmap(); a negative result is additionally
 * reported through GDKsyserror. */
int
MT_munmap(void *p, size_t len)
{
	const int rc = munmap(p, len);

	if (rc < 0) {
		GDKsyserror("munmap(%p,%zu) failed\n", p, len);
	}
	VALGRIND_FREELIKE_BLOCK(p, 0);
	return rc;
}
360 :
361 : /* expand or shrink a memory map (ala realloc).
362 : * the address returned may be different from the address going in.
363 : * in case of failure, the old address is still mapped and NULL is returned.
364 : */
365 : void *
366 465 : MT_mremap(const char *path, int mode, void *old_address, size_t old_size, size_t *new_size)
367 : {
368 465 : void *p;
369 465 : int fd = -1;
370 465 : int flags = mode & MMAP_COPY ? MAP_PRIVATE : MAP_SHARED;
371 465 : int prot = PROT_WRITE | PROT_READ;
372 :
373 : #ifdef MAP_FIXED_NOREPLACE
374 465 : flags |= MAP_FIXED_NOREPLACE;
375 : #endif
376 : /* round up to multiple of page size */
377 465 : *new_size = (*new_size + GDK_mmap_pagesize - 1) & ~(GDK_mmap_pagesize - 1);
378 :
379 : /* doesn't make sense for us to extend read-only memory map */
380 465 : assert(mode & MMAP_WRITABLE);
381 :
382 465 : if (*new_size < old_size) {
383 : #ifndef __COVERITY__ /* hide this from static code analyzer */
384 : /* shrink */
385 0 : VALGRIND_RESIZEINPLACE_BLOCK(old_address, old_size, *new_size, 0);
386 0 : if (munmap((char *) old_address + *new_size,
387 : old_size - *new_size) < 0) {
388 0 : GDKsyserror("MT_mremap(%s,%p,%zu,%zu): munmap() failed\n", path?path:"NULL", old_address, old_size, *new_size);
389 : /* even though the system call failed, we
390 : * don't need to propagate the error up: the
391 : * address should still work in the same way
392 : * as it did before */
393 0 : return old_address;
394 : }
395 0 : if (path && truncate(path, *new_size) < 0)
396 0 : GDKwarning("truncate of %s failed: %s\n",
397 : path, GDKstrerror(errno, (char[64]){0}, 64));
398 : #endif /* !__COVERITY__ */
399 0 : return old_address;
400 : }
401 465 : if (*new_size == old_size) {
402 : /* do nothing */
403 : return old_address;
404 : }
405 :
406 465 : if (!(mode & MMAP_COPY) && path != NULL) {
407 : /* "normal" memory map */
408 :
409 465 : if ((fd = open(path, O_RDWR | O_CLOEXEC)) < 0) {
410 0 : GDKsyserror("MT_mremap(%s,%p,%zu,%zu): open failed\n",
411 : path, old_address, old_size, *new_size);
412 0 : return NULL;
413 : }
414 465 : if (GDKextendf(fd, *new_size, path) != GDK_SUCCEED) {
415 0 : close(fd);
416 0 : TRC_ERROR(GDK, "MT_mremap(%s,%p,%zu,%zu): GDKextendf() failed\n", path, old_address, old_size, *new_size);
417 0 : return NULL;
418 : }
419 : #ifdef HAVE_MREMAP
420 : /* on Linux it's easy */
421 465 : p = mremap(old_address, old_size, *new_size, MREMAP_MAYMOVE);
422 : #ifdef HAVE_VALGRIND
423 : if (p != MAP_FAILED) {
424 : if (p == old_address) {
425 : VALGRIND_RESIZEINPLACE_BLOCK(old_address, old_size, *new_size, 0);
426 : } else {
427 : VALGRIND_FREELIKE_BLOCK(old_address, 0);
428 : VALGRIND_MALLOCLIKE_BLOCK(p, *new_size, 0, 1);
429 : }
430 : }
431 : #endif
432 : #else
433 : /* try to map extension at end of current map */
434 : p = mmap((char *) old_address + old_size, *new_size - old_size,
435 : prot, flags, fd, old_size);
436 : /* if it failed and MAP_FIXED_NOREPLACE is not defined,
437 : * there is no point trying a full mmap: that too won't
438 : * fit either (if MAP_FIXED_NOREPLACE, only relevant
439 : * failure is with EEXIST) */
440 : if (p != MAP_FAILED || errno == EEXIST) {
441 : if (p == (char *) old_address + old_size) {
442 : /* we got the requested address, make
443 : * sure we return the correct (old)
444 : * address */
445 : VALGRIND_RESIZEINPLACE_BLOCK(old_address, old_size, *new_size, 0);
446 : p = old_address;
447 : } else {
448 : /* we got some other address: discard
449 : * it and make full mmap */
450 : if (p != MAP_FAILED &&
451 : munmap(p, *new_size - old_size) < 0)
452 : GDKsyserror("munmap");
453 : #ifdef NO_MMAP_ALIASING
454 : if (msync(old_address, old_size, MS_SYNC) < 0)
455 : GDKsyserror("msync");
456 : #endif
457 : /* first create full mmap, then, if
458 : * successful, remove old mmap */
459 : #ifdef MAP_FIXED_NOREPLACE
460 : flags &= ~MAP_FIXED_NOREPLACE;
461 : #endif
462 : p = mmap(NULL, *new_size, prot, flags, fd, 0);
463 : if (p != MAP_FAILED) {
464 : VALGRIND_MALLOCLIKE_BLOCK(p, *new_size, 0, 1);
465 : if (munmap(old_address, old_size) < 0)
466 : GDKsyserror("munmap");
467 : VALGRIND_FREELIKE_BLOCK(old_address, 0);
468 : }
469 : }
470 : }
471 : #endif /* HAVE_MREMAP */
472 : } else {
473 : /* "copy-on-write" or "anonymous" memory map */
474 : #ifdef MAP_ANONYMOUS
475 0 : flags |= MAP_ANONYMOUS;
476 : #else
477 : if ((fd = open("/dev/zero", O_RDWR | O_CLOEXEC)) < 0) {
478 : GDKsyserror("MT_mremap(%s,%p,%zu,%zu): "
479 : "open('/dev/zero') failed\n",
480 : path ? path : "NULL", old_address,
481 : old_size, *new_size);
482 : return NULL;
483 : }
484 : #endif
485 : /* try to map an anonymous area as extent to the
486 : * current map */
487 0 : p = mmap((char *) old_address + old_size, *new_size - old_size,
488 : prot, flags, fd, 0);
489 : /* no point trying a full map if this didn't work:
490 : * there isn't enough space */
491 0 : if (p != MAP_FAILED || errno == EEXIST) {
492 0 : if (p == (char *) old_address + old_size) {
493 : /* we got the requested address, make
494 : * sure we return the correct (old)
495 : * address */
496 : VALGRIND_RESIZEINPLACE_BLOCK(old_address, old_size, *new_size, 0);
497 : p = old_address;
498 : } else {
499 : /* we got some other address: discard
500 : * it and make full mmap */
501 0 : if (p != MAP_FAILED &&
502 0 : munmap(p, *new_size - old_size) < 0)
503 0 : GDKsyserror("munmap");
504 : #ifdef MAP_FIXED_NOREPLACE
505 0 : flags &= ~MAP_FIXED_NOREPLACE;
506 : #endif
507 : #ifdef HAVE_MREMAP
508 : /* first get an area large enough for
509 : * *new_size */
510 0 : p = mmap(NULL, *new_size, prot, flags, fd, 0);
511 0 : if (p != MAP_FAILED) {
512 : /* then overlay old mmap over new */
513 0 : void *q;
514 :
515 0 : q = mremap(old_address, old_size,
516 : old_size,
517 : MREMAP_FIXED | MREMAP_MAYMOVE,
518 : p);
519 0 : assert(q == p || q == MAP_FAILED);
520 0 : if (q == MAP_FAILED) {
521 0 : int e = errno;
522 : /* we didn't expect this... */
523 0 : if (munmap(p, *new_size) < 0)
524 0 : GDKsyserror("munmap");
525 0 : p = MAP_FAILED;
526 0 : errno = e;
527 : }
528 : #ifdef HAVE_VALGRIND
529 : else {
530 : VALGRIND_FREELIKE_BLOCK(old_size, 0);
531 : VALGRIND_MALLOCLIKE_BLOCK(p, *new_size, 0, 1);
532 : }
533 : #endif
534 : }
535 : #else
536 : p = MAP_FAILED;
537 : if (path == NULL ||
538 : *new_size <= GDK_mmap_minsize_persistent) {
539 : /* size not too big yet or
540 : * anonymous, try to make new
541 : * anonymous mmap and copy
542 : * data over */
543 : p = mmap(NULL, *new_size, prot, flags,
544 : fd, 0);
545 : if (p != MAP_FAILED) {
546 : VALGRIND_MALLOCLIKE_BLOCK(p, *new_size, 0, 0);
547 : memcpy(p, old_address,
548 : old_size);
549 : munmap(old_address, old_size);
550 : VALGRIND_FREELIKE_BLOCK(old_address, 0);
551 : }
552 : /* if it failed, try alternative */
553 : }
554 : if (p == MAP_FAILED && path != NULL) {
555 : /* write data to disk, then
556 : * mmap it to new address */
557 : if (fd >= 0)
558 : close(fd);
559 : fd = -1;
560 : p = malloc(strlen(path) + 5);
561 : if (p == NULL){
562 : GDKsyserror("MT_mremap(%s,%p,%zu,%zu): fd < 0\n", path, old_address, old_size, *new_size);
563 : return NULL;
564 : }
565 :
566 : strcat(strcpy(p, path), ".tmp");
567 : fd = open(p, O_RDWR | O_CREAT | O_CLOEXEC,
568 : MONETDB_MODE);
569 : if (fd < 0) {
570 : GDKsyserror("MT_mremap(%s,%p,%zu,%zu): fd < 0\n", path, old_address, old_size, *new_size);
571 : free(p);
572 : return NULL;
573 : }
574 : free(p);
575 : if (write(fd, old_address,
576 : old_size) < 0 ||
577 : #ifdef HAVE_FALLOCATE
578 : /* prefer Linux-specific
579 : * fallocate over standard
580 : * posix_fallocate, since
581 : * glibc uses a rather
582 : * slow method of
583 : * allocating the file if
584 : * the file system doesn't
585 : * support the operation,
586 : * we just use ftruncate
587 : * in that case */
588 : (fallocate(fd, 0, (off_t) old_size, (off_t) *new_size - (off_t) old_size) < 0 && (errno != EOPNOTSUPP || ftruncate(fd, (off_t) *new_size) < 0))
589 : #else
590 : #ifdef HAVE_POSIX_FALLOCATE
591 : /* posix_fallocate returns
592 : * error number on
593 : * failure, not -1, and if
594 : * it returns EINVAL, the
595 : * underlying file system
596 : * may not support the
597 : * operation, so we then
598 : * need to try
599 : * ftruncate */
600 : ((errno = posix_fallocate(fd, (off_t) old_size, (off_t) *new_size - (off_t) old_size)) == EINVAL ? ftruncate(fd, (off_t) *new_size) < 0 : errno != 0)
601 : #else
602 : ftruncate(fd, (off_t) *new_size) < 0
603 : #endif
604 : #endif
605 : ) {
606 : GDKsyserror("MT_mremap(%s,%p,%zu,%zu): write() or "
607 : #ifdef HAVE_FALLOCATE
608 : "fallocate()"
609 : #else
610 : #ifdef HAVE_POSIX_FALLOCATE
611 : "posix_fallocate()"
612 : #else
613 : "ftruncate()"
614 : #endif
615 : #endif
616 : " failed\n", path, old_address, old_size, *new_size);
617 : /* extending failed:
618 : * free any disk space
619 : * allocated in the
620 : * process */
621 : if (ftruncate(fd, (off_t) old_size) < 0)
622 : GDKsyserror("MT_mremap(%s,%p,%zu,%zu): ftruncate() failed\n", path, old_address, old_size, *new_size);
623 : close(fd);
624 : return NULL;
625 : }
626 : p = mmap(NULL, *new_size, prot, flags,
627 : fd, 0);
628 : if (p != MAP_FAILED) {
629 : VALGRIND_MALLOCLIKE_BLOCK(p, *new_size, 0, 1);
630 : munmap(old_address, old_size);
631 : VALGRIND_FREELIKE_BLOCK(old_address, 0);
632 : }
633 : }
634 : #endif /* HAVE_MREMAP */
635 : }
636 : }
637 : }
638 465 : if (p == MAP_FAILED)
639 0 : GDKsyserror("MT_mremap(%s,%p,%zu,%zu): p == MAP_FAILED\n", path?path:"NULL", old_address, old_size, *new_size);
640 465 : if (fd >= 0)
641 465 : close(fd);
642 465 : return p == MAP_FAILED ? NULL : p;
643 : }
644 :
/* Synchronously flush (MS_SYNC) len bytes of the map at p.
 *
 * Returns the result of msync(); a negative result is additionally
 * reported through GDKsyserror. */
int
MT_msync(void *p, size_t len)
{
	const int rc = msync(p, len, MS_SYNC);

	if (rc < 0) {
		GDKsyserror("msync failed\n");
	}
	return rc;
}
654 :
655 : bool
656 18779864 : MT_path_absolute(const char *pathname)
657 : {
658 18779864 : return (*pathname == DIR_SEP);
659 : }
660 :
661 : #ifdef HAVE_DLFCN_H
662 : # include <dlfcn.h>
663 : #endif
664 :
665 : void *
666 0 : mdlopen(const char *library, int mode)
667 : {
668 0 : (void)library; /* Not used because of MacOs not handling dlopen on linked library */
669 0 : return dlopen(NULL, mode);
670 : }
671 :
672 : #else /* WIN32 native */
673 :
674 : #ifndef BUFSIZ
675 : #define BUFSIZ 1024
676 : #endif
677 :
678 : #undef _errno
679 :
680 : #include <windows.h>
681 :
682 : #ifdef _MSC_VER
683 : #include <io.h>
684 : #endif /* _MSC_VER */
685 : #include <Psapi.h>
686 :
687 : #define MT_SMALLBLOCK 256
688 :
689 : static LONG WINAPI
690 : MT_ignore_exceptions(struct _EXCEPTION_POINTERS *ExceptionInfo)
691 : {
692 : (void) ExceptionInfo;
693 : return EXCEPTION_EXECUTE_HANDLER;
694 : }
695 :
696 : void
697 : MT_init_posix(void)
698 : {
699 : SetUnhandledExceptionFilter(MT_ignore_exceptions);
700 : }
701 :
702 : size_t
703 : MT_getrss(void)
704 : {
705 : PROCESS_MEMORY_COUNTERS ctr;
706 : if (GetProcessMemoryInfo(GetCurrentProcess(), &ctr, sizeof(ctr)))
707 : return ctr.WorkingSetSize;
708 : return 0;
709 : }
710 :
711 : /* Windows mmap keeps a global list of base addresses for complex
712 : * (remapped) memory maps the reason is that each remapped segment
713 : * needs to be unmapped separately in the end. */
714 :
715 : void *
716 : MT_mmap(const char *path, int mode, size_t len)
717 : {
718 : DWORD mode0 = FILE_READ_ATTRIBUTES | FILE_READ_DATA;
719 : DWORD mode1 = FILE_SHARE_READ | FILE_SHARE_WRITE;
720 : DWORD mode2 = mode & MMAP_ADVISE;
721 : DWORD mode3 = PAGE_READONLY;
722 : int mode4 = FILE_MAP_READ;
723 : SECURITY_ATTRIBUTES sa;
724 : HANDLE h1, h2;
725 : void *ret;
726 : wchar_t *wpath = utf8towchar(path);
727 : if (wpath == NULL)
728 : return NULL;
729 :
730 : if (mode & MMAP_WRITE) {
731 : mode0 |= FILE_APPEND_DATA | FILE_WRITE_ATTRIBUTES | FILE_WRITE_DATA;
732 : }
733 : if (mode2 == MMAP_RANDOM || mode2 == MMAP_DONTNEED) {
734 : mode2 = FILE_FLAG_RANDOM_ACCESS;
735 : } else if (mode2 == MMAP_SEQUENTIAL || mode2 == MMAP_WILLNEED) {
736 : mode2 = FILE_FLAG_SEQUENTIAL_SCAN;
737 : } else {
738 : mode2 = FILE_FLAG_NO_BUFFERING;
739 : }
740 : if (mode & MMAP_SYNC) {
741 : mode2 |= FILE_FLAG_WRITE_THROUGH;
742 : }
743 : if (mode & MMAP_COPY) {
744 : mode3 = PAGE_WRITECOPY;
745 : mode4 = FILE_MAP_COPY;
746 : } else if (mode & MMAP_WRITE) {
747 : mode3 = PAGE_READWRITE;
748 : mode4 = FILE_MAP_WRITE;
749 : }
750 : mode2 |= FILE_ATTRIBUTE_NOT_CONTENT_INDEXED;
751 : sa.nLength = sizeof(SECURITY_ATTRIBUTES);
752 : sa.bInheritHandle = TRUE;
753 : sa.lpSecurityDescriptor = 0;
754 :
755 : h1 = CreateFileW(wpath, mode0, mode1, &sa, OPEN_ALWAYS, mode2, NULL);
756 : if (h1 == INVALID_HANDLE_VALUE) {
757 : (void) SetFileAttributesW(wpath, FILE_ATTRIBUTE_NORMAL);
758 : h1 = CreateFileW(wpath, mode0, mode1, &sa, OPEN_ALWAYS, mode2, NULL);
759 : if (h1 == INVALID_HANDLE_VALUE) {
760 : free(wpath);
761 : GDKwinerror("CreateFile('%s', %lu, %lu, &sa, %lu, %lu, NULL) failed\n",
762 : path, (unsigned long) mode0, (unsigned long) mode1, (unsigned long) OPEN_ALWAYS, (unsigned long) mode2);
763 : return NULL;
764 : }
765 : }
766 : free(wpath);
767 :
768 : h2 = CreateFileMapping(h1, &sa, mode3, (DWORD) (((__int64) len >> 32) & LL_CONSTANT(0xFFFFFFFF)), (DWORD) (len & LL_CONSTANT(0xFFFFFFFF)), NULL);
769 : if (h2 == NULL) {
770 : GDKwinerror("CreateFileMapping(%p, &sa, %lu, %lu, %lu, NULL) failed\n",
771 : h1, (unsigned long) mode3,
772 : (unsigned long) (((unsigned __int64) len >> 32) & LL_CONSTANT(0xFFFFFFFF)),
773 : (unsigned long) (len & LL_CONSTANT(0xFFFFFFFF)));
774 : CloseHandle(h1);
775 : return NULL;
776 : }
777 : CloseHandle(h1);
778 :
779 : ret = MapViewOfFileEx(h2, mode4, (DWORD) 0, (DWORD) 0, len, NULL);
780 : if (ret == NULL)
781 : errno = winerror(GetLastError());
782 : CloseHandle(h2);
783 :
784 : return ret;
785 : }
786 :
/* Unmap the view starting at p; the size argument is ignored.
 *
 * Returns 0 on success and -1 (after reporting through GDKwinerror)
 * on failure. */
int
MT_munmap(void *p, size_t dummy)
{
	(void) dummy;

	/* Windows' UnmapViewOfFile returns success!=0, error== 0,
	 * while Unix's munmap returns success==0, error==-1. */
	if (UnmapViewOfFile(p) != 0)
		return 0;

	GDKwinerror("UnmapViewOfFile failed\n");
	return -1;
}
802 :
803 : void *
804 : MT_mremap(const char *path, int mode, void *old_address, size_t old_size, size_t *new_size)
805 : {
806 : void *p;
807 :
808 : /* doesn't make sense for us to extend read-only memory map */
809 : assert(mode & MMAP_WRITABLE);
810 :
811 : /* round up to multiple of page size */
812 : *new_size = (*new_size + GDK_mmap_pagesize - 1) & ~(GDK_mmap_pagesize - 1);
813 :
814 : if (old_size >= *new_size) {
815 : *new_size = old_size;
816 : return old_address; /* don't bother shrinking */
817 : }
818 : if (GDKextend(path, *new_size) != GDK_SUCCEED) {
819 : TRC_ERROR(GDK, "MT_mremap(%s,%p,%zu,%zu): GDKextend() failed\n", path?path:"NULL", old_address, old_size, *new_size);
820 : return NULL;
821 : }
822 : if (path && !(mode & MMAP_COPY))
823 : MT_munmap(old_address, old_size);
824 : p = MT_mmap(path, mode, *new_size);
825 : if (p != NULL && (path == NULL || (mode & MMAP_COPY))) {
826 : memcpy(p, old_address, old_size);
827 : MT_munmap(old_address, old_size);
828 : }
829 :
830 : if (p == NULL)
831 : TRC_ERROR(GDK, "MT_mremap(%s,%p,%zu,%zu): p == NULL\n", path?path:"NULL", old_address, old_size, *new_size);
832 : return p;
833 : }
834 :
/* Flush len bytes of the mapped view at p with FlushViewOfFile.
 *
 * Returns 0 on success and -1 (after reporting through GDKwinerror)
 * on failure. */
int
MT_msync(void *p, size_t len)
{
	/* Windows' FlushViewOfFile returns success!=0, error== 0,
	 * while Unix's msync returns success==0, error==-1. */
	if (FlushViewOfFile(p, len) != 0)
		return 0;

	GDKwinerror("FlushViewOfFile failed\n");
	return -1;
}
849 :
/* Return true iff pathname is considered absolute: either it starts
 * with a backslash, or it starts with a drive letter, a colon and a
 * directory separator ('/' or '\\'). */
bool
MT_path_absolute(const char *pathname)
{
	const char c0 = pathname[0];

	/* leading backslash */
	if (c0 == '\\')
		return true;

	/* drive letter, colon, directory separator; the checks are
	 * ordered so that no character past the terminating NUL is
	 * ever inspected */
	if (!(('a' <= c0 && c0 <= 'z') || ('A' <= c0 && c0 <= 'Z')))
		return false;
	if (pathname[1] != ':')
		return false;
	return pathname[2] == '/' || pathname[2] == '\\';
}
860 :
861 : #ifndef HAVE_GETTIMEOFDAY
862 : static int nodays[12] = { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 };
863 :
864 : #define LEAPYEAR(y) ((((y)%4)==0 && ((y)%100)!=0) || ((y)%400)==0)
865 : #define NODAYS(m,y) (((m)!=2)?nodays[(m)-1]:LEAPYEAR(y)?29:28)
866 :
867 : int
868 : gettimeofday(struct timeval *tv, int *ignore_zone)
869 : {
870 : unsigned int year, day, month;
871 : SYSTEMTIME st;
872 :
873 : (void) ignore_zone;
874 : GetSystemTime(&st);
875 : day = 0;
876 : for (year = 1970; year < st.wYear; year++)
877 : day += LEAPYEAR(year) ? 366 : 365;
878 :
879 : for (month = 1; month < st.wMonth; month++)
880 : day += NODAYS(month, st.wYear);
881 :
882 : day += st.wDay;
883 : tv->tv_sec = 60 * (day * 24 * 60 + st.wMinute) + st.wSecond;
884 : tv->tv_usec = 1000 * st.wMilliseconds;
885 : return 0;
886 : }
887 : #endif
888 :
/* Open the named library; forwards both arguments to the dlopen
 * emulation defined in this file. */
void *
mdlopen(const char *file, int mode)
{
	void *handle = dlopen(file, mode);

	return handle;
}
894 :
/* Emulation of dlopen() on top of the Win32 API.
 *
 * file  UTF-8 name of the library to load, or NULL to get the
 *       handle of the running executable
 * mode  ignored
 *
 * Returns the module handle, or NULL on failure. */
void *
dlopen(const char *file, int mode)
{
	wchar_t *wfile;
	void *module;

	(void) mode;

	if (file == NULL)
		return GetModuleHandle(NULL);

	/* LoadLibraryW needs the name as a wide-character string */
	wfile = utf8towchar(file);
	if (wfile == NULL)
		return NULL;
	module = LoadLibraryW(wfile);
	free(wfile);
	return module;
}
909 :
910 : int
911 : dlclose(void *handle)
912 : {
913 : if (handle != NULL) {
914 : return FreeLibrary((HINSTANCE) handle);
915 : }
916 : return -1;
917 : }
918 :
919 : void *
920 : dlsym(void *handle, const char *name)
921 : {
922 : if (handle != NULL) {
923 : return (void *) GetProcAddress((HINSTANCE) handle, name);
924 : }
925 : return NULL;
926 : }
927 :
928 : char *
929 : dlerror(void)
930 : {
931 : static char msg[1024];
932 :
933 : FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM, NULL, GetLastError(), 0, msg, sizeof(msg), NULL);
934 : return msg;
935 : }
936 : #endif
937 :
/* Suspend the calling thread for about ms milliseconds.
 *
 * On non-Windows systems a request of exactly 1 ms is special-cased:
 * the sub-second part of the delay is then 1000 ns (nanosleep) or
 * 1 us (select) instead of a full millisecond.  The result of the
 * underlying call is ignored. */
void
MT_sleep_ms(unsigned int ms)
{
#ifdef NATIVE_WIN32
	Sleep(ms);
#else
#ifdef HAVE_NANOSLEEP
	struct timespec delay = {
		.tv_sec = ms / 1000,
		.tv_nsec = ms == 1 ? 1000 : (long) (ms % 1000) * 1000000,
	};

	(void) nanosleep(&delay, NULL);
#else
	struct timeval delay = {
		.tv_sec = ms / 1000,
		.tv_usec = ms == 1 ? 1 : (ms % 1000) * 1000,
	};

	/* select without any descriptors just waits for the timeout */
	(void) select(0, NULL, NULL, NULL, &delay);
#endif
#endif
}
955 :
956 : #if !defined(HAVE_LOCALTIME_R) || !defined(HAVE_GMTIME_R) || !defined(HAVE_ASCTIME_R) || !defined(HAVE_CTIME_R)
957 : static MT_Lock timelock = MT_LOCK_INITIALIZER(timelock);
958 : #endif
959 :
960 : #ifndef HAVE_LOCALTIME_R
961 : struct tm *
962 : localtime_r(const time_t *restrict timep, struct tm *restrict result)
963 : {
964 : struct tm *tmp;
965 : MT_lock_set(&timelock);
966 : tmp = localtime(timep);
967 : if (tmp)
968 : *result = *tmp;
969 : MT_lock_unset(&timelock);
970 : return tmp ? result : NULL;
971 : }
972 : #endif
973 :
974 : #ifndef HAVE_GMTIME_R
975 : struct tm *
976 : gmtime_r(const time_t *restrict timep, struct tm *restrict result)
977 : {
978 : struct tm *tmp;
979 : MT_lock_set(&timelock);
980 : tmp = gmtime(timep);
981 : if (tmp)
982 : *result = *tmp;
983 : MT_lock_unset(&timelock);
984 : return tmp ? result : NULL;
985 : }
986 : #endif
987 :
988 : #ifndef HAVE_ASCTIME_R
989 : char *
990 : asctime_r(const struct tm *restrict tm, char *restrict buf)
991 : {
992 : char *tmp;
993 : MT_lock_set(&timelock);
994 : tmp = asctime(tm);
995 : if (tmp)
996 : strcpy(buf, tmp);
997 : MT_lock_unset(&timelock);
998 : return tmp ? buf : NULL;
999 : }
1000 : #endif
1001 :
1002 : #ifndef HAVE_CTIME_R
1003 : char *
1004 : ctime_r(const time_t *restrict t, char *restrict buf)
1005 : {
1006 : char *tmp;
1007 : MT_lock_set(&timelock);
1008 : tmp = ctime(t);
1009 : if (tmp)
1010 : strcpy(buf, tmp);
1011 : MT_lock_unset(&timelock);
1012 : return tmp ? buf : NULL;
1013 : }
1014 : #endif
1015 :
1016 : #ifndef HAVE_STRERROR_R
1017 : static MT_Lock strerrlock = MT_LOCK_INITIALIZER(strerrlock);
1018 :
1019 : int
1020 : strerror_r(int errnum, char *buf, size_t buflen)
1021 : {
1022 : char *msg;
1023 : MT_lock_set(&strerrlock);
1024 : msg = strerror(errnum);
1025 : strcpy_len(buf, msg, buflen);
1026 : MT_lock_unset(&strerrlock);
1027 : return 0;
1028 : }
1029 : #endif
|