I ran the msqldump operation again and the server crashed at exactly the
same place: about two-thirds of the way through the table, after
outputting exactly the same number of lines.
Some more information about the crash. This is a backtrace:
#0 0x0000003f27da4577 in strCmp () from /lib64/libbat.so.11
#1 0x0000003f2773a8dc in TABLEToutput_file () from /lib64/libmonetdb5.so.17
#2 0x00007f212dbe8168 in mvc_export_table.isra () from
/usr/lib64/monetdb5/lib_sql.so
#3 0x00007f212dbed587 in mvc_export_chunk () from
/usr/lib64/monetdb5/lib_sql.so
#4 0x00007f212dbc955d in SQLparser () from /usr/lib64/monetdb5/lib_sql.so
#5 0x0000003f27661649 in runScenarioBody () from /lib64/libmonetdb5.so.17
#6 0x0000003f2766226d in runScenario () from /lib64/libmonetdb5.so.17
#7 0x0000003f27662340 in MSserveClient () from /lib64/libmonetdb5.so.17
#8 0x0000003f27e0fb1f in thread_starter () from /lib64/libbat.so.11
#9 0x00000035cb207ee5 in start_thread () from /lib64/libpthread.so.0
#10 0x00000035caaf4b8d in clone () from /lib64/libc.so.6
Register contents:
rax 0x0 0
rbx 0xbb9a3067f37e0 3300334583625696
rcx 0x3f2816b380 271255516032
rdx 0x80 128
rsi 0xbb9a3067f37e0 3300334583625696
rdi 0x3f27f44880 271253260416
rbp 0xc 0xc
rsp 0x7f212cc6ca18 0x7f212cc6ca18
r8 0x3f2816aae0 271255513824
r9 0x8a0 2208
r10 0x3f27f44880 271253260416
r11 0x35cab74bf0 231034276848
r12 0x4 4
r13 0x2000 8192
r14 0x7f2120576b68 139780253248360
r15 0x7f2120578060 139780253253728
rip 0x3f27da4577 0x3f27da4577
eflags 0x10246 [ PF ZF IF RF ]
cs 0x33 51
ss 0x2b 43
ds 0x0 0
es 0x0 0
fs 0x0 0
gs 0x0 0
st0 0 (raw 0x00000000000000000000)
st1 0 (raw 0x00000000000000000000)
st2 0 (raw 0x00000000000000000000)
st3 0 (raw 0x00000000000000000000)
st4 0 (raw 0x00000000000000000000)
st5 0 (raw 0x00000000000000000000)
st6 0 (raw 0x00000000000000000000)
st7 0 (raw 0x00000000000000000000)
fctrl 0x37f 895
fstat 0x0 0
ftag 0xffff 65535
fiseg 0x0 0
fioff 0x0 0
foseg 0x0 0
fooff 0x0 0
fop 0x0 0
mxcsr 0x1fa0 [ PE IM DM ZM OM UM PM ]
This is the disassembly of strCmp(), though of course the origin of the
problem is probably higher in the call chain:
0x0000003f27da4500 <+0>: test %rdi,%rdi
0x0000003f27da4503 <+3>: je 0x3f27da4570
0x0000003f27da4505 <+5>: movzbl (%rdi),%edx
0x0000003f27da4508 <+8>: cmp $0x80,%dl
0x0000003f27da450b <+11>: je 0x3f27da4570
0x0000003f27da450d <+13>: test %rsi,%rsi
0x0000003f27da4510 <+16>: je 0x3f27da4580
0x0000003f27da4512 <+18>: movzbl (%rsi),%ecx
0x0000003f27da4515 <+21>: mov $0x1,%eax
0x0000003f27da451a <+26>: cmp $0x80,%cl
0x0000003f27da451d <+29>: je 0x3f27da4568
0x0000003f27da451f <+31>: cmp %dl,%cl
0x0000003f27da4521 <+33>: mov $0xffffffff,%eax
0x0000003f27da4526 <+38>: ja 0x3f27da4568
0x0000003f27da4528 <+40>: mov $0x1,%eax
0x0000003f27da452d <+45>: jb 0x3f27da4568
0x0000003f27da452f <+47>: jne 0x3f27da4568
0x0000003f27da4531 <+49>: test %cl,%cl
0x0000003f27da4533 <+51>: jne 0x3f27da4544
0x0000003f27da4535 <+53>: jmp 0x3f27da4560
0x0000003f27da4537 <+55>: nopw 0x0(%rax,%rax,1)
0x0000003f27da4540 <+64>: test %al,%al
0x0000003f27da4542 <+66>: je 0x3f27da4560
0x0000003f27da4544 <+68>: add $0x1,%rdi
0x0000003f27da4548 <+72>: add $0x1,%rsi
0x0000003f27da454c <+76>: movzbl (%rdi),%eax
0x0000003f27da454f <+79>: cmp (%rsi),%al
0x0000003f27da4551 <+81>: je 0x3f27da4540
0x0000003f27da4553 <+83>: sbb %eax,%eax
0x0000003f27da4555 <+85>: or $0x1,%eax
0x0000003f27da4558 <+88>: retq
0x0000003f27da4559 <+89>: nopl 0x0(%rax)
0x0000003f27da4560 <+96>: xor %eax,%eax
0x0000003f27da4562 <+98>: nopw 0x0(%rax,%rax,1)
0x0000003f27da4568 <+104>: repz retq
0x0000003f27da456a <+106>: nopw 0x0(%rax,%rax,1)
0x0000003f27da4570 <+112>: test %rsi,%rsi
0x0000003f27da4573 <+115>: je 0x3f27da4560
0x0000003f27da4575 <+117>: xor %eax,%eax
=> 0x0000003f27da4577 <+119>: cmpb $0x80,(%rsi)
0x0000003f27da457a <+122>: setne %al
0x0000003f27da457d <+125>: neg %eax
0x0000003f27da457f <+127>: retq
0x0000003f27da4580 <+128>: mov $0x1,%eax
0x0000003f27da4585 <+133>: retq
I'm not sure how MonetDB works its way through the table to produce the
msqldump output, so it's hard to guess which records it is working on at
the time of the crash. However, the crash always occurs after writing
2,147,749,019 msqldump records, which is slightly more than 2^31
(2,147,483,648). Running a plain SELECT on, say, the next 20,000 records
after that point displays them with no problem, so it seems like the
issue may be specific to msqldump.
This is what appeared in the system log at the time of the crash:
mserver5[20195] general protection ip:3f27da4577 sp:7f212cc6ca18 error:0
in libbat.so.11.0.3[3f27c00000+369000]
mserver5[20195] general protection ip:3f27da4577 sp:7f212cc6ca18 error:0
in libbat.so.11.0.3[3f27c00000+369000]
The machine is a 64-bit system with 32GB of ECC memory. There is no
indication of a memory fault.
What I thought I'd try tonight is to recompile the server with debug
options and run it under valgrind, but I'm not sure what the best build
options are. If you have any ideas about what I could or should do to
help find this problem, just let me know.
Tim