Hi,

In message "Re: Copy-on-write friendly garbage collector"
    on Mon, 3 Mar 2008 18:48:34 +0900, Hongli Lai <hongli / plan99.net> writes:

|I've written a patch which makes Ruby's garbage collector copy-on-write 
|friendly. Details can be found on my blog, 
|http://izumi.plan99.net/blog/index.php/category/optimizing-rails/, in 
|the "Making Ruby's garbage collector copy-on-write friendly" series.
|
|Matz had shown interest in merging the patch into Ruby 1.9. I'm 
|wondering whether that has already been done, and if not, whether I can 
|be of any assistance.

Here's the patch against the latest trunk (r15675).  It's still 8-10%
slower than the current implementation.

							matz.
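
For readers skimming the diff below: the central change is that the mark bit no
longer lives in each object's flags word (the old FL_MARK bit) but in a per-heap
bitfield (heaps_slot.marks), so the mark phase writes only to those small,
separate bitfields instead of dirtying the memory page of every live object.
Here is a minimal sketch of that style of bookkeeping, using illustrative names
rather than the identifiers from the patch:

    /* Sketch only: external mark bits, one bit per slot, kept outside the
     * objects themselves so marking does not dirty the object pages.
     * Names here (heap_block, mark_bits, slot_index, ...) are illustrative,
     * not the identifiers used in the patch below. */
    #include <limits.h>
    #include <string.h>

    #define BITS_PER_WORD (sizeof(unsigned int) * CHAR_BIT)

    struct heap_block {
        void *slots;             /* start of the object slots */
        size_t slot_size;        /* sizeof(RVALUE) in the real patch */
        size_t num_slots;
        unsigned int *mark_bits; /* num_slots bits, rounded up to whole words */
    };

    /* Index of an object within its heap block. */
    static size_t
    slot_index(const struct heap_block *h, const void *obj)
    {
        return (size_t)((const char *)obj - (const char *)h->slots) / h->slot_size;
    }

    /* Set the mark bit for obj; only the bitfield word is written. */
    static void
    mark_set(struct heap_block *h, const void *obj)
    {
        size_t i = slot_index(h, obj);
        h->mark_bits[i / BITS_PER_WORD] |= 1u << (i % BITS_PER_WORD);
    }

    /* Test whether obj is marked. */
    static int
    mark_test(const struct heap_block *h, const void *obj)
    {
        size_t i = slot_index(h, obj);
        return (h->mark_bits[i / BITS_PER_WORD] >> (i % BITS_PER_WORD)) & 1u;
    }

    /* Clear all mark bits for the next GC cycle. */
    static void
    mark_clear_all(struct heap_block *h)
    {
        size_t words = (h->num_slots + BITS_PER_WORD - 1) / BITS_PER_WORD;
        memset(h->mark_bits, 0, words * sizeof(unsigned int));
    }

Because every mark lookup now needs the owning heap slot, the patch also
caches the last heaps_slot that was hit (last_heap): consecutive lookups
tend to land in the same heap.
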
diff --git a/debug.c b/debug.c
index d7f99ed..dfcb523 100644
--- a/debug.c
+++ b/debug.c
@@ -29,8 +29,8 @@ static const union {
         RUBY_ENC_CODERANGE_7BIT    = ENC_CODERANGE_7BIT,
         RUBY_ENC_CODERANGE_VALID   = ENC_CODERANGE_VALID,
         RUBY_ENC_CODERANGE_BROKEN  = ENC_CODERANGE_BROKEN, 
-        RUBY_FL_MARK        = FL_MARK,
-        RUBY_FL_RESERVED    = FL_RESERVED,
+        RUBY_FL_RESERVED0   = FL_RESERVED0,
+        RUBY_FL_RESERVED1   = FL_RESERVED1,
         RUBY_FL_FINALIZE    = FL_FINALIZE,
         RUBY_FL_TAINT       = FL_TAINT,
         RUBY_FL_EXIVAR      = FL_EXIVAR,
diff --git a/gc.c b/gc.c
index c47f8a0..38a9fe7 100644
--- a/gc.c
+++ b/gc.c
@@ -22,8 +22,14 @@
 #include "gc.h"
 #include <stdio.h>
 #include <setjmp.h>
+#include <math.h>
 #include <sys/types.h>
 
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
 #ifdef HAVE_SYS_TIME_H
 #include <sys/time.h>
 #endif
@@ -145,6 +151,8 @@ static struct heaps_slot {
     void *membase;
     RVALUE *slot;
     int limit;
+    int *marks;
+    int marks_size;
 } *heaps;
 static int heaps_length = 0;
 static int heaps_used   = 0;
@@ -322,6 +330,36 @@ ruby_xfree(void *x)
 	RUBY_CRITICAL(free(x));
 }
 
+static int debugging = 0;
+
+#define DEBUG_POINT(message) \
+	do { \
+		if (debugging) { \
+			printf("%s\n", message); \
+			getchar(); \
+		} \
+	} while (0)
+
+#define OPTION_ENABLED(name) (getenv((name)) && *getenv((name)) && *getenv((name)) != '0')
+
+static void *
+alloc_ruby_heap(size_t size)
+{
+    return malloc(size);
+}
+
+static void
+free_ruby_heap(void *heap)
+{
+    free(heap);
+}
+
+static void
+init_debugging()
+{
+    debugging = OPTION_ENABLED("RUBY_GC_DEBUG");
+}
+
 
 /*
  *  call-seq:
@@ -413,6 +451,107 @@ rb_gc_unregister_address(VALUE *addr)
     }
 }
 
+static struct heaps_slot *last_heap = NULL;
+
+static inline struct heaps_slot *
+find_heap_slot_for_object(RVALUE *object)
+{
+    struct heaps_slot *heap;
+    register long hi, lo, mid;
+
+    /* Look in the cache first. */
+    if (last_heap != NULL && object >= last_heap->slot
+	&& object < last_heap->slot + last_heap->limit) {
+	return last_heap;
+    }
+    /* Find the heaps_slot for the object using binary search. */
+    lo = 0;
+    hi = heaps_used;
+    while (lo < hi) {
+	mid = (lo + hi) / 2;
+	heap = &heaps[mid];
+	if (heap->slot <= object) {
+	    if (object < heap->slot + heap->limit) {
+		/* Cache this result. According to empirical evidence, the chance is
+		 * high that the next lookup will be for the same heap slot.
+		 */
+		last_heap = heap;
+		return heap;
+	    }
+	    lo = mid + 1;
+	}
+	else {
+	    hi = mid;
+	}
+    }
+    return NULL;
+}
+
+static inline void
+find_position_in_bitfield(struct heaps_slot *hs, RVALUE *object,
+                          unsigned int *bitfield_index, unsigned int *bitfield_offset)
+{
+    unsigned int index;
+
+    index = object - hs->slot;
+    *bitfield_index = index / (sizeof(int) * 8);
+    *bitfield_offset = index % (sizeof(int) * 8);
+}
+
+
+static void
+rb_mark_table_add(RVALUE *object)
+{
+    struct heaps_slot *hs;
+    unsigned int bitfield_index, bitfield_offset;
+
+    hs = find_heap_slot_for_object(object);
+    if (hs != NULL) {
+	find_position_in_bitfield(hs, object, &bitfield_index, &bitfield_offset);
+	hs->marks[bitfield_index] |= (1 << bitfield_offset);
+    }
+}
+
+static inline int
+rb_mark_table_heap_contains(struct heaps_slot *hs, RVALUE *object)
+{
+    unsigned int bitfield_index, bitfield_offset;
+
+    find_position_in_bitfield(hs, object, &bitfield_index, &bitfield_offset);
+    return hs->marks[bitfield_index] & (1 << bitfield_offset);
+}
+
+static inline int
+rb_mark_table_contains(RVALUE *object)
+{
+    struct heaps_slot *hs;
+
+    hs = find_heap_slot_for_object(object);
+    if (hs != NULL) {
+	return rb_mark_table_heap_contains(hs, object);
+    }
+    return 0;
+}
+
+static inline void
+rb_mark_table_heap_remove(struct heaps_slot *hs, RVALUE *object)
+{
+    unsigned int bitfield_index, bitfield_offset;
+    find_position_in_bitfield(hs, object, &bitfield_index, &bitfield_offset);
+    hs->marks[bitfield_index] &= ~(1 << bitfield_offset);
+}
+
+static void
+rb_mark_table_remove(RVALUE *object)
+{
+    struct heaps_slot *hs;
+
+    hs = find_heap_slot_for_object(object);
+    if (hs != NULL) {
+	rb_mark_table_heap_remove(hs, object);
+    }
+}
+
 static int
 heap_cmp(const void *ap, const void *bp, void *dummy)
 {
@@ -445,7 +583,7 @@ add_heap(void)
     }
 
     for (;;) {
-	RUBY_CRITICAL(p = (RVALUE*)malloc(sizeof(RVALUE)*(heap_slots+1)));
+	RUBY_CRITICAL(p = (RVALUE*)alloc_ruby_heap(sizeof(RVALUE)*(heap_slots+1)));
 	if (p == 0) {
 	    if (heap_slots == HEAP_MIN_SLOTS) {
 		rb_memerror();
@@ -460,6 +598,8 @@ add_heap(void)
 	    p = (RVALUE*)((VALUE)p + sizeof(RVALUE) - ((VALUE)p % sizeof(RVALUE)));
 	heaps[heaps_used].slot = p;
 	heaps[heaps_used].limit = heap_slots;
+        heaps[heaps_used].marks_size = (int) (ceil(heap_slots / (sizeof(int) * 8.0)));
+        heaps[heaps_used].marks = (int *) calloc(heaps[heaps_used].marks_size, sizeof(int));
 	break;
     }
     pend = p + heap_slots;
@@ -494,6 +634,7 @@ rb_newobj_from_heap(void)
     freelist = freelist->as.free.next;
 
     MEMZERO((void*)obj, RVALUE, 1);
+    RANY(obj)->as.free.flags = 0;
 #ifdef GC_DEBUG
     RANY(obj)->file = rb_sourcefile();
     RANY(obj)->line = rb_sourceline();
@@ -702,8 +843,7 @@ gc_mark_all(void)
     for (i = 0; i < heaps_used; i++) {
 	p = heaps[i].slot; pend = p + heaps[i].limit;
 	while (p < pend) {
-	    if ((p->as.basic.flags & FL_MARK) &&
-		(p->as.basic.flags != FL_MARK)) {
+	    if (rb_mark_table_contains(p) && (p->as.basic.flags != 0)) {
 		gc_mark_children((VALUE)p, 0);
 	    }
 	    p++;
@@ -737,21 +877,8 @@ is_pointer_to_heap(void *ptr)
     if (p < lomem || p > himem) return Qfalse;
     if ((VALUE)p % sizeof(RVALUE) != 0) return Qfalse;
 
-    /* check if p looks like a pointer using bsearch*/
-    lo = 0;
-    hi = heaps_used;
-    while (lo < hi) {
-	mid = (lo + hi) / 2;
-	heap = &heaps[mid];
-	if (heap->slot <= p) {
-	    if (p < heap->slot + heap->limit)
-		return Qtrue;
-	    lo = mid + 1;
-	}
-	else {
-	    hi = mid;
-	}
-    }
+    if (find_heap_slot_for_object(p))
+	return Qtrue;
     return Qfalse;
 }
 
@@ -857,8 +984,8 @@ gc_mark(VALUE ptr, int lev)
     obj = RANY(ptr);
     if (rb_special_const_p(ptr)) return; /* special const not marked */
     if (obj->as.basic.flags == 0) return;       /* free cell */
-    if (obj->as.basic.flags & FL_MARK) return;  /* already marked */
-    obj->as.basic.flags |= FL_MARK;
+    if (rb_mark_table_contains(obj)) return;  /* already marked */
+    rb_mark_table_add(obj);
 
     if (lev > GC_LEVEL_MAX || (lev == 0 && ruby_stack_check())) {
 	if (!mark_stack_overflow) {
@@ -892,8 +1019,8 @@ gc_mark_children(VALUE ptr, int lev)
     obj = RANY(ptr);
     if (rb_special_const_p(ptr)) return; /* special const not marked */
     if (obj->as.basic.flags == 0) return;       /* free cell */
-    if (obj->as.basic.flags & FL_MARK) return;  /* already marked */
-    obj->as.basic.flags |= FL_MARK;
+    if (rb_mark_table_contains(obj)) return;  /* already marked */
+    rb_mark_table_add(obj);
 
   marking:
     if (FL_TEST(obj, FL_EXIVAR)) {
@@ -1147,10 +1274,15 @@ finalize_list(RVALUE *p)
     while (p) {
 	RVALUE *tmp = p->as.free.next;
 	run_final((VALUE)p);
-	if (!FL_TEST(p, FL_SINGLETON)) { /* not freeing page */
+	/* Don't free objects that are singletons, or objects that are already freed.
+	 * The latter is to prevent the unnecessary marking of memory pages as dirty,
+	 * which can destroy copy-on-write semantics.
+	 */
+	if (!FL_TEST(p, FL_SINGLETON) && p->as.free.flags != 0) {
             VALGRIND_MAKE_MEM_UNDEFINED((void*)p, sizeof(RVALUE));
 	    p->as.free.flags = 0;
 	    p->as.free.next = freelist;
+	    rb_mark_table_remove(p);
 	    freelist = p;
 	}
 	p = tmp;
@@ -1164,7 +1296,8 @@ free_unused_heaps(void)
 
     for (i = j = 1; j < heaps_used; i++) {
 	if (heaps[i].limit == 0) {
-	    free(heaps[i].membase);
+	    free_ruby_heap(heaps[i].membase);
+	    free(heaps[i].marks);
 	    heaps_used--;
 	}
 	else {
@@ -1208,29 +1341,34 @@ gc_sweep(void)
 
 	p = heaps[i].slot; pend = p + heaps[i].limit;
 	while (p < pend) {
-	    if (!(p->as.basic.flags & FL_MARK)) {
+	    if (!rb_mark_table_contains(p)) {
 		if (p->as.basic.flags) {
 		    obj_free((VALUE)p);
 		}
 		if (need_call_final && FL_TEST(p, FL_FINALIZE)) {
-		    p->as.free.flags = FL_MARK; /* remain marked */
+		    p->as.free.flags = FL_FINALIZE;
 		    p->as.free.next = final_list;
 		    final_list = p;
 		}
 		else {
                     VALGRIND_MAKE_MEM_UNDEFINED((void*)p, sizeof(RVALUE));
-		    p->as.free.flags = 0;
-		    p->as.free.next = freelist;
+		    /* Do not touch the fields if they don't have to be modified.
+		     * This is in order to preserve copy-on-write semantics.
+		     */
+		    if (p->as.free.flags != 0)
+			p->as.free.flags = 0;
+		    if (p->as.free.next != freelist)
+			p->as.free.next = freelist;
 		    freelist = p;
 		}
 		n++;
 	    }
-	    else if (RBASIC(p)->flags == FL_MARK) {
+	    else if (RBASIC(p)->flags == FL_FINALIZE) {
 		/* objects to be finalized */
-		/* do nothing remain marked */
+		/* do nothing here */
 	    }
 	    else {
-		RBASIC(p)->flags &= ~FL_MARK;
+		rb_mark_table_heap_remove(&heaps[i], p);
 		live++;
 	    }
 	    p++;
@@ -1272,6 +1410,7 @@ rb_gc_force_recycle(VALUE p)
     VALGRIND_MAKE_MEM_UNDEFINED((void*)p, sizeof(RVALUE));
     RANY(p)->as.free.flags = 0;
     RANY(p)->as.free.next = freelist;
+    rb_mark_table_remove((RVALUE *) p);
     freelist = RANY(p);
 }
 
@@ -1462,6 +1601,7 @@ mark_current_machine_context(rb_thread_t *th)
 
     FLUSH_REGISTER_WINDOWS;
     /* This assumes that all registers are saved into the jmp_buf (and stack) */
+    memset(save_regs_gc_mark, 0, sizeof(save_regs_gc_mark));
     setjmp(save_regs_gc_mark);
     mark_locations_array((VALUE*)save_regs_gc_mark,
 			 sizeof(save_regs_gc_mark) / sizeof(VALUE));
@@ -1501,6 +1641,7 @@ garbage_collect(void)
 
     SET_STACK_END;
 
+    last_heap = NULL;
     init_mark_stack();
 
     th->vm->self ? rb_gc_mark(th->vm->self) : rb_vm_mark(th->vm);
@@ -1602,6 +1743,17 @@ ruby_set_stack_size(size_t size)
     rb_gc_stack_maxsize = size;
 }
 
+int
+rb_gc_is_thread_marked(VALUE the_thread)
+{
+    if (FL_ABLE(the_thread)) {
+	return rb_mark_table_contains((RVALUE *) the_thread);
+    }
+    else {
+	return 0;
+    }
+}
+
 void
 Init_stack(VALUE *addr)
 {
@@ -2037,6 +2190,7 @@ rb_gc_call_finalizer_at_exit(void)
 		DATA_PTR(p) && RANY(p)->as.data.dfree &&
 		RANY(p)->as.basic.klass != rb_cThread) {
 		p->as.free.flags = 0;
+		rb_mark_table_remove(p);
 		if ((long)RANY(p)->as.data.dfree == -1) {
 		    RUBY_CRITICAL(free(DATA_PTR(p)));
 		}
@@ -2048,6 +2202,7 @@ rb_gc_call_finalizer_at_exit(void)
 	    else if (BUILTIN_TYPE(p) == T_FILE) {
 		if (rb_io_fptr_finalize(RANY(p)->as.file.fptr)) {
 		    p->as.free.flags = 0;
+		    rb_mark_table_remove(p);
                     VALGRIND_MAKE_MEM_UNDEFINED((void*)p, sizeof(RVALUE));
 		}
 	    }
@@ -2268,6 +2423,61 @@ count_objects(int argc, VALUE *argv, VALUE os)
     return hash;
 }
 
+static VALUE
+os_statistics()
+{
+    int i;
+    int n = 0;
+    unsigned int objects = 0;
+    unsigned int total_heap_size = 0;
+    unsigned int ast_nodes = 0;
+    char message[1024];
+
+    for (i = 0; i < heaps_used; i++) {
+	RVALUE *p, *pend;
+
+	p = heaps[i].slot;
+	pend = p + heaps[i].limit;
+	for (;p < pend; p++) {
+	    if (p->as.basic.flags) {
+		int isAST = 0;
+		switch (TYPE(p)) {
+		  case T_ICLASS:
+		  case T_NODE:
+		    isAST = 1;
+		    break;
+		  case T_CLASS:
+		    if (FL_TEST(p, FL_SINGLETON)) {
+		        isAST = 1;
+		        break;
+		    }
+		  default:
+		    break;
+		}
+		objects++;
+		if (isAST) {
+		   ast_nodes++;
+		}
+	    }
+	}
+	total_heap_size += (char*)pend - (char*)heaps[i].membase;
+    }
+
+    snprintf(message, sizeof(message),
+        "Number of objects: %u (%u AST nodes, %.2f%%)\n"
+        "Heap slot size: %d\n"
+        "Number of heaps: %d\n"
+        "Total size of objects: %.2f KB\n"
+        "Total size of heaps: %.2f KB\n",
+        objects, ast_nodes, ast_nodes * 100 / (double) objects,
+        (int) sizeof(RVALUE),
+        heaps_used,
+        objects * sizeof(RVALUE) / 1024.0,
+        total_heap_size / 1024.0
+    );
+    return rb_str_new2(message);
+}
+
 /*
  *  The <code>GC</code> module provides an interface to Ruby's mark and
  *  sweep garbage collection mechanism. Some of the underlying methods
@@ -2300,6 +2510,8 @@ Init_GC(void)
 
     rb_define_module_function(rb_mObSpace, "_id2ref", id2ref, 1);
 
+    rb_define_module_function(rb_mObSpace, "statistics", os_statistics, 0);
+
     rb_gc_register_address(&rb_mObSpace);
     rb_global_variable(&finalizers);
     rb_gc_unregister_address(&rb_mObSpace);
@@ -2315,4 +2527,6 @@ Init_GC(void)
     rb_define_method(rb_mKernel, "object_id", rb_obj_id, 0);
 
     rb_define_module_function(rb_mObSpace, "count_objects", count_objects, -1);
+
+    init_debugging();
 }
diff --git a/include/ruby/ruby.h b/include/ruby/ruby.h
index 4438bc3..1626a7e 100644
--- a/include/ruby/ruby.h
+++ b/include/ruby/ruby.h
@@ -621,8 +621,8 @@ struct RBignum {
 #define RVALUES(obj) (R_CAST(RValues)(obj))
 
 #define FL_SINGLETON FL_USER0
-#define FL_MARK      (((VALUE)1)<<5)
-#define FL_RESERVED  (((VALUE)1)<<6) /* will be used in the future GC */
+#define FL_RESERVED0 (((VALUE)1)<<5) /* will be used in the future GC */
+#define FL_RESERVED1 (((VALUE)1)<<6) /* will be used in the future GC */
 #define FL_FINALIZE  (((VALUE)1)<<7)
 #define FL_TAINT     (((VALUE)1)<<8)
 #define FL_EXIVAR    (((VALUE)1)<<9)
@@ -716,6 +716,7 @@ void rb_global_variable(VALUE*);
 void rb_register_mark_object(VALUE);
 void rb_gc_register_address(VALUE*);
 void rb_gc_unregister_address(VALUE*);
+int rb_gc_is_thread_marked(VALUE);
 
 ID rb_intern(const char*);
 ID rb_intern2(const char*, long);