This is a multi-part message in MIME format.
--------------020601090808060907030704
Content-Type: text/plain; charset=ISO-8859-1
Content-Transfer-Encoding: 7bit

The attached patch against Ruby 1.8.6-p110 improves the performance of
multi-threaded applications, especially those that create helper threads
when the parent thread is deeply nested in its call stack.

Totally Bogus Benchmark script #1:
----------------------------

$depth  

def outer name
  if ($depth+ < 2000
    outer name
  else
    250.times {
      Thread.new do
        inner :inner, name, Thread.current
      end.join
    }
  end
end

def inner innerName, outerName, parent
  Thread.new do
    parent.join
    k  roc.new {|n0, n1| q  0.to_s << n1.to_s }
    k[innerName, outerName]
  end
end
   
outer :outer

---------------------

Equally Bogus Benchmark Script #2:
----------------------------

$depth  

def outer name
  ball  hread.current
  inner  hread.new Thread.current do | outer |
    loop {
      Thread.stop until (Thread.criticalue; ball inner)
      (ballter).run
    }
  end
  2000.times {|i|  
    Thread.stop until (Thread.criticalue; ball Thread.current)
    (ballner).run
  }
end

def recurse someArg
  if ($depth+ < 2000
    recurse 3.14159
  else
    outer :outer
  end
end

recurse "top"
-------------------------------

On my 1.5 Ghz Centrino laptop, benchmark #1 executes about twice as fast
after the attached patch is applied.  Benchmark #2 shows a 30% improvement.

Admittedly these benchmarks were contrived, as all benchmarks are,
to demonstrate situations where this patch would yield the greatest
performance
improvement.  However, I don't believe there is any Ruby script that will
execute measurably slower or take more memory after this patch is applied.
(Yes, I'm challenging someone to prove me wrong there :-)

Unpatched, 'C' Ruby allows each thread's backtrace to chain up into the
frames of its parent, frozen at the time the child was first created.
This means that context switches require copying not only the active
thread's
frames, but all of the parent's as well.  Further, creating new Proc
object's
in the child thread requires that "framedup()" copy the parent's frames as
well as the child's.  Moreover, the frozen parent's stack keeps the garbage
collector from releasing objects referenced there, even after those
references are no longer present in the active parent thread as it
continues to execute (or dies).

This patch works by isolating each thread's stack from that of its parent.
There is still just one stack, but when a new thread is created, its current
"ruby_frame" is linked directly back to Ruby's "top_frame".  Thus, each
thread
behaves as through it was started with its own empty stack.  When a context
switch is performed, only the slice of the stack that is accessible by
the active
thread need by copied from the heap.  Whenever it is necessary
to duplicate the active thread's frames, only those are copied, not the
parent's
as well.  This saves both time and space.  The only "cost" is an additional
pointer in the (already large) "struct thread"

If all this seems a bit too good to be true, it just might be.
I've tested the patch against our multi-threaded "killer" app that
managed to crash all unpatched versions of Ruby 1.6.8, 1.8.x, 1.9 and JRuby.
If you've got a multi-threaded test or application, I invite you to
try it against this patch.  Feedback would be very much appreciated.

Yes, there are a couple side effects:

1)  ruby backtraces from child threads will now end at the thread's
creation.
     Frames from other thread's will no longer be included in any backtrace.
     I strongly suspect that Ruby 1.9, JRuby and other "native threaded"
     Ruby implementations will behave this way as well.
     (Can anyone confirm this?)

2)  The 'C'-level stack will become corrupt as slices of it get copied
     into and out of the heap.  If you are using a 'C'-level debugger, its
     backtrace command will get confused after it ascends the stack
     into frames above those of the current Ruby thread.

Background:
I noticed this monolithic stack copying behavior while debugging a
couple segfault causing bugs I'd found in ruby 1.8.6 and 1.6.8.
These resulted in some instances when a child thread
outlived its parents.  (See my recent post:  Continuations on dead
threads ...)

In 1.6.8, the situation is worse.  Without the appropriate version of
this patch,
the "newest" 1.6.8 snapshot will very occasionally crash running these
benchmarks.
The good news is that benchmark #1 runs 5 times faster after 1.6.8 is
patched :-)
Benchmark #2 runs  twice as fast.  If anyone, aside from me, is still
interested
in Ruby v1.6.8, just let me know.  I'll be happy to post a patch for it.

- brent



--------------020601090808060907030704
Content-Type: text/x-patch;
 namehread18.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
 filenamehread18.patch"

Index: intern.h
RCS file: /home/cvs/ESP/gen2/software/ruby/ruby-1.8.6-mbari/intern.h,v
retrieving revision 1.1
diff -u -r1.1 intern.h
--- intern.h	18 Dec 2007 01:10:32 -0000	1.1
+++ intern.h	19 Dec 2007 00:17:06 -0000
@@ -238,7 +238,7 @@
 /* gc.c */
 NORETURN(void rb_memerror __((void)));
 int ruby_stack_check _((void));
-int ruby_stack_length _((VALUE**));
+int ruby_stack_length _((VALUE *,VALUE**));
 char *rb_source_filename _((const char*));
 void rb_gc_mark_locations _((VALUE*, VALUE*));
 void rb_mark_tbl _((struct st_table*));
Index: node.h
RCS file: /home/cvs/ESP/gen2/software/ruby/ruby-1.8.6-mbari/node.h,v
retrieving revision 1.1
diff -u -r1.1 node.h
--- node.h	18 Dec 2007 02:18:44 -0000	1.1
+++ node.h	19 Dec 2007 00:17:07 -0000
@@ -408,10 +408,8 @@
 
     VALUE result;
 
-    long   stk_len;
-    long   stk_max;
-    VALUE *stk_ptr;
-    VALUE *stk_pos;
+    long   stk_len, stk_max;
+    VALUE *stk_ptr, *stk_pos, *stk_start;
 #ifdef __ia64
     long   bstr_len;
     long   bstr_max;
Index: gc.c
RCS file: /home/cvs/ESP/gen2/software/ruby/ruby-1.8.6-mbari/gc.c,v
retrieving revision 1.1
diff -u -r1.1 gc.c
--- gc.c	10 Nov 2007 01:08:15 -0000	1.1
+++ gc.c	19 Dec 2007 00:16:27 -0000
@@ -450,14 +500,14 @@
 # define STACK_END (stack_end)
 #endif
 #if defined(sparc) || defined(__sparc__)
-# define STACK_LENGTH  (rb_gc_stack_start - STACK_END + 0x80)
+# define STACK_LENGTH(start)  ((start) - STACK_END + 0x80)
 #elif STACK_GROW_DIRECTION < 0
-# define STACK_LENGTH  (rb_gc_stack_start - STACK_END)
+# define STACK_LENGTH(start)  ((start) - STACK_END)
 #elif STACK_GROW_DIRECTION > 0
-# define STACK_LENGTH  (STACK_END - rb_gc_stack_start + 1)
+# define STACK_LENGTH  (STACK_END - (start) + 1)
 #else
-# define STACK_LENGTH  ((STACK_END < rb_gc_stack_start) ? rb_gc_stack_start - STACK_END\
-                                           : STACK_END - rb_gc_stack_start + 1)
+# define STACK_LENGTH  ((STACK_END < (start)) ? \
+                         (start) - STACK_END  :  STACK_END - (start) + 1)
 #endif
 #if STACK_GROW_DIRECTION > 0
 # define STACK_UPPER(x, a, b) a
@@ -481,17 +531,17 @@
 #define GC_WATER_MARK 512
 
 #define CHECK_STACK(ret) do {\
-    SET_STACK_END;\
-    (ret)  STACK_LENGTH > STACK_LEVEL_MAX + GC_WATER_MARK);\
+  SET_STACK_END;\
+  (ret)  STACK_LENGTH(rb_gc_stack_start) > STACK_LEVEL_MAX + GC_WATER_MARK);\
 } while (0)
 
 int
-ruby_stack_length(p)
-    VALUE **p;
+ruby_stack_length(start, base)
+    VALUE *start, **base;
 {
     SET_STACK_END;
-    if (p) *p  TACK_UPPER(STACK_END, rb_gc_stack_start, STACK_END);
-    return STACK_LENGTH;
+    if (base) *base  TACK_UPPER(STACK_END, start, STACK_END);
+    return STACK_LENGTH(start);
 }
 
 int
@@ -1321,7 +1365,7 @@
 garbage_collect()
 {
     struct gc_list *list;
-    struct FRAME * volatile frame; /* gcc 2.7.2.3 -O2 bug??  */
+    struct FRAME *frame;
     jmp_buf save_regs_gc_mark;
     SET_STACK_END;
 
Index: eval.c
RCS file: /home/cvs/ESP/gen2/software/ruby/ruby-1.8.6-mbari/eval.c,v
retrieving revision 1.2
diff -u -r1.2 eval.c
--- eval.c	2007-09-22 17:01:50.000000000 -0700
+++ ../ruby-1.8.6-mbari/eval.c	2007-12-18 00:18:24.000000000 -0800
@@ -6509,6 +6509,7 @@
 
 	    scope_dup(ruby_scope);
 	    for (tagot_tag; tag; tag
g->prev) {
+                if (tag->tag PROT_THREAD) break;
 		scope_dup(tag->scope);
 	    }
 	    for (vars  uby_dyna_vars; vars; vars  ars->next) {
@@ -10216,13 +10324,10 @@
 rb_thread_save_context(th)
     rb_thread_t th;
 {
-    VALUE *pos;
     int len;
     static VALUE tval;
 
-    len  uby_stack_length(&pos);
-    th->stk_len  ;
-    th->stk_pos  os;
+    len  uby_stack_length(th->stk_start,&th->stk_pos);
     if (len > th->stk_max) {
 	VALUE *ptr  ealloc(th->stk_ptr, sizeof(VALUE) * len);
 	if (!ptr) rb_memerror();
@@ -11721,6 +11826,7 @@
     th->result  ;\
     th->flags  ;\
 \
+    th->stk_start  b_gc_stack_start;\
     th->stk_ptr  ;\
     th->stk_len  ;\
     th->stk_max  ;\
@@ -11932,6 +12038,15 @@
 		 "can't start a new thread (frozen ThreadGroup)");
     }
 
+    th->stk_start   /* establish start of new thread's stack */
+#if STACK_GROW_DIRECTION > 0
+      (VALUE *)ruby_frame;
+#elif STACK_GROW_DIRECTION < 0
+      (VALUE *)(ruby_frame+1);
+#else
+      (VALUE *)(ruby_frame+((VALUE *)(&arg)<rb_gc_stack_start))
+#endif
+
     if (!thread_init) {
 	thread_init  ;
 #if defined(HAVE_SETITIMER) || defined(_THREAD_SAFE)
@@ -11977,6 +12092,8 @@
     PUSH_TAG(PROT_THREAD);
     if ((state  XEC_TAG()) 0) {
 	if (THREAD_SAVE_CONTEXT(th) 0) {
+            ruby_frame->prev  op_frame;     /* hide parent thread's frames */
+            ruby_frame->tmp  ;   /* <-- not sure whether this is necessary */           
 	    curr_thread  h;
 	    th->result  *fn)(arg, th);
 	}
@@ -12090,7 +12207,7 @@
     rb_thread_t th  b_thread_alloc(klass);
     volatile VALUE *pos;
 
-    pos  h->stk_pos;
+    pos  h->stk_pos;   //brent / mbari.org wants to know what this assignment does?
     rb_obj_call_init(th->thread, argc, argv);
     if (th->stk_pos 0) {
 	rb_raise(rb_eThreadError, "uninitialized thread - check `%s#initialize'",

--------------020601090808060907030704--