Thanks, the following patch should fix it.  Your test runs much in less
than 20s and uses 10M on my older x86-64 machine.   It took over 2
minutes before.

(lightly tested, and I'm unfamiliar with the weakmap code):

--- a/gc.c
+++ b/gc.c
@@ -2360,6 +2360,21 @@ define_final0(VALUE obj, VALUE block)
 
     if (st_lookup(finalizer_table, obj, &data)) {
 	table = (VALUE)data;
+
+	/* avoid duplicate block, table is usually small */
+	{
+	    const VALUE *ptr = RARRAY_CONST_PTR(table);
+	    long len = RARRAY_LEN(table);
+	    long i;
+
+	    for (i = 0; i < len; i++, ptr++) {
+		if (rb_funcall(*ptr, idEq, 1, block)) {
+		    rb_gc_force_recycle(block);
+		    return *ptr;
+		}
+	    }
+	}
+
 	rb_ary_push(table, block);
     }
     else {