More information on this multithreaded crash:
My application starts up six or seven service
threads. Each thread simulates a piece of robotic
hardware for simulation runs. Ruby fails sometimes
after starting just one thread, but on the next run it may start
five before crashing. It always fails just as a thread is first
started. It never has managed to start all threads without
segfaulting.
The failures occur in one of three places in vm.c.
The failure site varies randomly among the three from run to run.
I have not been able to produce a simple test case,
but I now have a lot more information:
Here's the 'C' backtrace from gdb for crash site #1:
(gdb) backtrace
#0 0x080f095d in lfp_set_special_cref (lfp=0xb7d52138, cref=0x0) at
vm.c:822
#1 0x080f03b8 in vm_invoke_proc (th=0x8452208, proc=0x84521a8,
self=3083694740, argc=2, argv=0x84521f8) at vm.c:616
#2 0x080fa271 in thread_start_func_2 (th=0x8452208,
stack_start=0xb768e3c0) at thread.c:317
#3 0x080f980e in thread_start_func_1 (th_ptr=0x8452208) at
thread_pthread.ci:163
#4 0xb7f594bb in start_thread () from /lib/libpthread.so.0
#5 0xb7e914de in clone () from /lib/libc.so.6
BTW - native OS threads make debugging in gdb much more straightforward.
Yea!
The segfault occurs in vm.c here:
/*
 * Replace the special cref associated with the local frame pointer `lfp`
 * and return the previous one.
 *
 * lfp[-1] is reinterpreted as a `struct RValues *`.  When VMDEBUG is set,
 * check_svar() is called first.  If `cref` is 0 and that slot is either
 * Qnil or has a zero `basic.klass`, nothing is stored and 0 is returned.
 * Otherwise the old value is read with lfp_svar_get(GET_THREAD(), lfp, 2),
 * `cref` is written with lfp_svar_set(GET_THREAD(), lfp, 2, ...), and the
 * old value is returned.
 *
 * NOTE(review): the only guard before dereferencing `values` is the
 * comparison against Qnil; per this report, a slot holding another
 * non-pointer VALUE (1 was observed) still reaches `values->basic.klass`.
 */
static NODE *
lfp_set_special_cref(VALUE *lfp, NODE * cref)
{
struct RValues *values = (void *) lfp[-1];
///// values = 1 (i.e. FIXNUM(0)) by the line above ////
NODE *old_cref;
if (VMDEBUG) {
check_svar();
}
///// "values" dereference causes segfault in line below /////
if (cref == 0 && ((VALUE)values == Qnil || values->basic.klass == 0)) {
old_cref = 0;
}
else {
old_cref = (NODE *)lfp_svar_get(GET_THREAD(), lfp, 2);
lfp_svar_set(GET_THREAD(), lfp, 2, (VALUE)cref);
}
return old_cref;
}
values is loaded with Ruby FixNum(0) at lfp[-1].
The test values==Qnil is not sufficient to ensure that
it can be dereferenced.
Here's the code at crash site #2 (its 'C' backtrace from gdb follows the code):
/*
 * Return the special cref stored for the local frame pointer `lfp`, or 0.
 *
 * lfp[-1] is reinterpreted as a `struct RValues *`.  If it is not Qnil and
 * its `basic.klass` field is non-zero, that field is returned cast to
 * NODE *; otherwise 0 is returned.
 *
 * NOTE(review): as written, any value other than Qnil is dereferenced, so
 * a slot holding a non-pointer VALUE is not filtered out by this test.
 */
static NODE *
lfp_get_special_cref(VALUE *lfp)
{
struct RValues *values;
//attempt to dereference values==1 on next line
if (((VALUE)(values = (void *)lfp[-1])) != Qnil &&
values->basic.klass) {
return (NODE *)values->basic.klass;
}
else {
return 0;
}
}
Here is the corresponding backtrace for this case:
#0 0x080eda28 in lfp_get_special_cref (lfp=0xb7ccc138) at insnhelper.ci:830
#1 0x080edc50 in get_cref (iseq=0x831cea8, lfp=0xb7ccc138) at
insnhelper.ci:924
#2 0x080f5fab in vm_get_cvar_base (th=0x844f5a0, iseq=0x831cea8) at
insnhelper.ci:1047
#3 0x080f1430 in vm_eval (th=0x844f5a0, initial=0) at insns.def:189
#4 0x080f6e72 in vm_eval_body (th=0x844f5a0) at vm.c:1163
#5 0x080f0058 in invoke_block (th=0x844f5a0, block=0x844f540,
self=3083151940, argc=0, argv=0x844f590) at vm.c:583
#6 0x080f01da in vm_invoke_proc (th=0x844f5a0, proc=0x844f540,
self=3083151940, argc=2, argv=0x844f590) at vm.c:622
#7 0x080f9f99 in thread_start_func_2 (th=0x844f5a0,
stack_start=0xb7a103c0) at thread.c:316
#8 0x080f9552 in thread_start_func_1 (th_ptr=0x844f5a0) at
thread_pthread.ci:163
#9 0xb7ed34bb in start_thread () from /lib/libpthread.so.0
#10 0xb7e0b4de in clone () from /lib/libc.so.6
Here's the code at crash site #3 (its 'C' backtrace from gdb follows the code):
static VALUE
rb_const_get_0(VALUE klass, ID id, int exclude, int recurse)
{
VALUE value, tmp;
int mod_retry = 0;
tmp = klass;
retry:
while (tmp && !NIL_P(tmp)) {
/// segfault in this next "while" expression
while (RCLASS_IV_TBL(tmp) &&
st_lookup(RCLASS_IV_TBL(tmp),id,&value)) {
if (value == Qundef) {
if (!RTEST(rb_autoload_load(tmp, id))) break;
continue;
}
if (exclude && tmp == rb_cObject && klass != rb_cObject) {
rb_warn("toplevel constant %s referenced by %s::%s",
rb_id2name(id), rb_class2name(klass),
rb_id2name(id));
}
return value;
}
if (!recurse && klass != rb_cObject) break;
....
And, here is the backtrace corresponding to that final failure site:
#0 0x080e8262 in rb_const_get_0 (klass=138068320, id=8317, exclude=0,
recurse=2) at variable.c:1421
#1 0x080e8352 in rb_const_get (klass=138068320, id=8317) at variable.c:1453
#2 0x080f64a2 in vm_get_ev_const (th=0x844e748, iseq=0x831ad18,
klass=138068320, id=8317, is_defined=0) at insnhelper.ci:1023
#3 0x080f1554 in vm_eval (th=0x844e748, initial=0) at insns.def:225
#4 0x080f6e72 in vm_eval_body (th=0x844e748) at vm.c:1163
#5 0x080f0058 in invoke_block (th=0x844e748, block=0x844e6e8,
self=3083964760, argc=0, argv=0x844e738) at vm.c:583
#6 0x080f01da in vm_invoke_proc (th=0x844e748, proc=0x844e6e8,
self=3083964760, argc=2, argv=0x844e738) at vm.c:622
#7 0x080f9f99 in thread_start_func_2 (th=0x844e748,
stack_start=0xb7bd83c0) at thread.c:316
#8 0x080f9552 in thread_start_func_1 (th_ptr=0x844e748) at
thread_pthread.ci:163
#9 0xb7f994bb in start_thread () from /lib/libpthread.so.0
#10 0xb7ed14de in clone () from /lib/libc.so.6
Given these hints, what might I try next to isolate the bug?
By the way:
This application adds methods and instance variables
to the base Thread class and subclasses
Thread to create its own ScheduleThread class for
simulation of real-time hardware in simulated time.
This works fine in Ruby 1.6.8 and seems OK in 1.8.6.
Is it a bad idea in Ruby 1.9?