ワナベと申します。

rb_yield を連続で呼び出すときに、積んだフレームを使いまわす関数を書きました。
試験的に Fixnum#times にだけ組み込んでいますが、ほかにも
Array#each, Range#each などで使えると思います。

ruby -e 'GC.disable;t=Time.now;(10**7).times {};p Time.now - t'
こんな感じの簡単な計測では、4.9 秒から 3.5 秒ほどに縮まりました。
一応手元では make test と make test-all の動作を確認してあります。


Index: eval.c
===================================================================
--- eval.c	(revision 16218)
+++ eval.c	(working copy)
@@ -960,6 +960,41 @@
     return v;
 }

+/* Fast-yield primitives implemented in vm.c. */
+rb_block_t *vm_yield_fast_setup(rb_thread_t *th, int argc); /* vm.c */
+VALUE vm_yield_fast(rb_thread_t *th, rb_block_t *block, int argc, VALUE *argv); /* vm.c */
+void vm_yield_fast_finish(rb_thread_t *th, rb_block_t *block); /* vm.c */
+
+/*
+ * Prepares the current block once for one-argument yields, calls
+ * i_proc(val, block), then undoes the preparation.  i_proc is expected to
+ * yield via rb_yield_fast_call() with the block pointer it receives.
+ * Returns i_proc's result.
+ *
+ * NOTE(review): vm_yield_fast_finish() is not reached if i_proc exits
+ * non-locally (break, exception) -- confirm unwinding copes with that.
+ */
+VALUE
+rb_yield_fast(VALUE (*i_proc)(ANYARGS), VALUE val)
+{
+    rb_thread_t *th = GET_THREAD();
+    rb_block_t *block = vm_yield_fast_setup(th, 1);
+    VALUE result = i_proc(val, (void*)block);
+
+    vm_yield_fast_finish(th, block);
+    return result;
+}
+
+/*
+ * Yields val to the block prepared by rb_yield_fast() and returns what
+ * vm_yield_fast() returns.  block is the pointer handed to i_proc.
+ */
+VALUE
+rb_yield_fast_call(VALUE val, void *block)
+{
+    return vm_yield_fast(GET_THREAD(), (rb_block_t *)block, 1, &val);
+}
+
 static VALUE
 loop_i()
 {
Index: numeric.c
===================================================================
--- numeric.c	(revision 16218)
+++ numeric.c	(working copy)
@@ -3016,17 +3016,31 @@
  */

+VALUE rb_yield_fast(VALUE (*i_proc)(ANYARGS), VALUE val); /* eval.c */
+VALUE rb_yield_fast_call(VALUE val, void *block); /* eval.c */
+
+/*
+ * rb_yield_fast() callback for the Fixnum case of int_dotimes: yields
+ * 0, 1, ..., num-1 to the prepared block and returns num.
+ */
+static VALUE
+int_dotimes_i(VALUE num, void *block)
+{
+    long i, end;
+
+    end = FIX2LONG(num);
+    for (i=0; i<end; i++) {
+	rb_yield_fast_call(LONG2FIX(i), block);
+    }
+    return num;
+}
+
 static VALUE
 int_dotimes(VALUE num)
 {
     RETURN_ENUMERATOR(num, 0, 0);

     if (FIXNUM_P(num)) {
-	long i, end;
-
-	end = FIX2LONG(num);
-	for (i=0; i<end; i++) {
-	    rb_yield(LONG2FIX(i));
-	}
+	rb_yield_fast(int_dotimes_i, num);
     }
     else {
 	VALUE i = INT2FIX(0);
Index: vm.c
===================================================================
--- vm.c	(revision 16218)
+++ vm.c	(working copy)
@@ -541,11 +541,10 @@

 /* C -> Ruby: block */

-static VALUE
-invoke_block(rb_thread_t *th, rb_block_t *block, VALUE self,
+static inline void
+setup_block(rb_thread_t *th, rb_block_t *block, VALUE self,
 	     int argc, VALUE *argv, rb_block_t *blockptr)
 {
-    VALUE val;
     if (BUILTIN_TYPE(block->iseq) != T_NODE) {
 	rb_iseq_t *iseq = block->iseq;
 	rb_control_frame_t *cfp = th->cfp;
@@ -557,8 +556,14 @@

 	CHECK_STACK_OVERFLOW(cfp, argc + iseq->stack_max);

-	for (i=0; i<argc; i++) {
-	    cfp->sp[i] = argv[i];
+	if (argv) {
+	    for (i=0; i<argc; i++) {
+		cfp->sp[i] = argv[i];
+	    }
+	} else {
+	    for (i=0; i<argc; i++) {
+		cfp->sp[i] = Qnil;
+	    }
 	}

 	opt_pc = vm_yield_setup_args(th, iseq, argc, cfp->sp, blockptr,
@@ -568,7 +573,15 @@
 		      self, GC_GUARDED_PTR(block->dfp),
 		      iseq->iseq_encoded + opt_pc, cfp->sp + arg_size, block->lfp,
 		      iseq->local_size - arg_size);
+    }
+}

+static inline VALUE
+invoke_block_fast(rb_thread_t *th, rb_block_t *block, VALUE self,
+	     int argc, VALUE *argv, rb_block_t *blockptr)
+{
+    VALUE val;
+    if (BUILTIN_TYPE(block->iseq) != T_NODE) {
 	val = vm_eval_body(th);
     }
     else {
@@ -577,19 +590,114 @@
     return val;
 }

-VALUE
-vm_yield(rb_thread_t *th, int argc, VALUE *argv)
+/*
+ * Prepares the block with setup_block() and then runs it with
+ * invoke_block_fast(), passing the same arguments to both.
+ */
+static VALUE
+invoke_block(rb_thread_t *th, rb_block_t *block, VALUE self,
+	     int argc, VALUE *argv, rb_block_t *blockptr)
 {
-    rb_block_t *block = GC_GUARDED_PTR_REF(th->cfp->lfp[0]);
+    setup_block(th, block, self, argc, argv, blockptr);
+    return invoke_block_fast(th, block, self, argc, argv, blockptr);
+}

+/*
+ * Fetches the block stored in cfp->lfp[0].  When there is none,
+ * vm_localjump_error() is called with "no block given".
+ */
+static inline rb_block_t *
+vm_get_block(rb_control_frame_t *cfp)
+{
+    rb_block_t *block = GC_GUARDED_PTR_REF(cfp->lfp[0]);
+
     if (block == 0) {
 	vm_localjump_error("no block given", Qnil, 0);
     }
+    return block;
+}

+/* Yields the argc values in argv to the block of th's current frame. */
+VALUE
+vm_yield(rb_thread_t *th, int argc, VALUE *argv)
+{
+    rb_block_t *block = vm_get_block(th->cfp);
     return invoke_block(th, block, block->self, argc, argv, 0);
 }

+/*
+ * First step of a fast yield: looks up the block of th's current frame
+ * and runs setup_block() for argc arguments.  argv is passed as 0, so
+ * setup_block() stores Qnil in the argument slots; vm_yield_fast()
+ * writes the real values later.  Returns the block.
+ */
+rb_block_t *
+vm_yield_fast_setup(rb_thread_t *th, int argc)
+{
+    rb_block_t *block = vm_get_block(th->cfp);
+    setup_block(th, block, block->self, argc, 0, 0);
+    return block;
+}
+
+/*
+ * Runs a block prepared by vm_yield_fast_setup() with the argc values in
+ * argv and returns the result of invoke_block_fast().  For a non-T_NODE
+ * block it first rewrites the argument slots, pc and dfp[0] of the frame
+ * at th->cfp.  Afterwards it decrements th->cfp twice (and the pc of the
+ * frame in between), presumably so that the frames pushed by the setup
+ * are reused by the next call -- TODO confirm.  vm_yield_fast_finish()
+ * reverses that final adjustment.
+ */
 VALUE
+vm_yield_fast(rb_thread_t *th, rb_block_t *block, int argc, VALUE *argv)
+{
+    int i;
+    int opt_pc;
+    VALUE retval;
+
+    if (BUILTIN_TYPE(block->iseq) != T_NODE) {
+	CHECK_STACK_OVERFLOW(th->cfp, argc + block->iseq->stack_max);
+	for (i=0; i<argc; i++) {
+	    th->cfp->sp[i] = argv[i];
+	}
+	opt_pc = vm_yield_setup_args(th, block->iseq, argc, th->cfp->sp, 0,
+			    block_proc_is_lambda(block->proc));
+	/* copy the processed arguments to the sp slots of th->cfp[2] */
+	for (i=0; i<block->iseq->arg_size; i++) {
+	    th->cfp[2].sp[i] = th->cfp->sp[i];
+	}
+	th->cfp->pc = block->iseq->iseq_encoded + opt_pc;
+	th->cfp->dfp[0] = GC_GUARDED_PTR(block->dfp);
+    }
+    retval = invoke_block_fast(th, block, block->self, argc, argv, 0);
+    if (BUILTIN_TYPE(block->iseq) != T_NODE) {
+	/* store Qnil in local_size slots, then step th->cfp back twice */
+	for (i=0; i<block->iseq->local_size; i++) {
+	    th->cfp->sp[i] = Qnil;
+	}
+	th->cfp--;
+	th->cfp->pc--;
+	th->cfp--;
+    }
+    return retval;
+}
+
+/*
+ * Ends a fast-yield sequence.  For a non-T_NODE block it increments
+ * th->cfp twice (and the pc of the frame in between), the inverse of the
+ * adjustment made at the end of vm_yield_fast().
+ */
+void
+vm_yield_fast_finish(rb_thread_t *th, rb_block_t *block)
+{
+    if (BUILTIN_TYPE(block->iseq) != T_NODE) {
+	th->cfp++;
+	th->cfp->pc++;
+	th->cfp++;
+    }
+}
+
+VALUE
 vm_invoke_proc(rb_thread_t *th, rb_proc_t *proc,
 	       VALUE self, int argc, VALUE *argv, rb_block_t *blockptr)
 {


-- 
ワナベ