This is a multi-part message in MIME format.
--------------070506010207040603010401
Content-Type: text/plain; charset=ISO-8859-1; format=flowed
Content-Transfer-Encoding: 7bit

Currently Marshal.dump stores Hash entries in no specific order. This 
prevents one from reliably comparing marshalled data in order to 
determine whether a data structure changed. For example, consider the 
following code:

hash   'root' [1, [1, 1.5], 3, 4], 'foo' 'zort' }
data  arshal.dump(hash)
data2  arshal.dump(Marshal.load(data))

"data ! ata2" might be possible.

PStore uses an MD5 checksum of the marshalled data to check whether the 
file needs to be updated. Marshal.dump's non-deterministic hash ordering 
gets in the way.

The attached patch adds a 'canonical' argument to Marshal.dump. This 
argument allows Hash entries to be stored in a deterministic order, at 
the cost of a slight performance hit. (PStore will have to be modified 
to take advantage of this, but I'm already working on that.) The patch 
is made against Ruby 1.8.6.

Regards,
Hongli Lai

--------------070506010207040603010401
Content-Type: text/x-patch;
 nameuby-marshal-dump-canonical.diff"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
 filenameuby-marshal-dump-canonical.diff"

diff --git a/marshal.c b/marshal.c
index 185ace6..8643894 100644
--- a/marshal.c
+++ b/marshal.c
@@ -83,6 +83,7 @@ shortlen(len, ds)
 static ID s_dump, s_load, s_mdump, s_mload;
 static ID s_dump_data, s_load_data, s_alloc;
 static ID s_getc, s_read, s_write, s_binmode;
+static ID id_rehash;
 
 struct dump_arg {
     VALUE obj;
@@ -90,6 +91,7 @@ struct dump_arg {
     st_table *symbols;
     st_table *data;
     int taint;
+    int canonical;
 };
 
 struct dump_call_arg {
@@ -621,6 +623,9 @@ w_object(obj, arg, limit)
 		w_byte(TYPE_HASH_DEF, arg);
 	    }
 	    w_long(RHASH(obj)->tbl->num_entries, arg);
+	    if (arg->canonical) {
+		rb_funcall(obj, id_rehash, 0);
+	    }
 	    rb_hash_foreach(obj, hash_each, (st_data_t)&c_arg);
 	    if (!NIL_P(RHASH(obj)->ifnone)) {
 		w_object(RHASH(obj)->ifnone, arg, limit);
@@ -700,13 +705,21 @@ dump_ensure(arg)
 
 /*
  * call-seq:
- *      dump( obj [, anIO] , limit1 ) anIO
+ *      dump( obj [, anIO] , limit , canonicalů±se ) anIO
+ *
+ * Serializes obj and all descendent objects.
+ *
+ * If +anIO+ is specified, the serialized data will be written to it,
+ * otherwise the data will be returned as a String.
  *
- * Serializes obj and all descendent objects. If anIO is
- * specified, the serialized data will be written to it, otherwise the
- * data will be returned as a String. If limit is specified, the
- * traversal of subobjects will be limited to that depth. If limit is
- * negative, no checking of depth will be performed.
+ * If +limit+ is specified, the traversal of subobjects will be limited
+ * to that depth. If limit is negative, no checking of depth will be
+ * performed.
+ *
+ * If +canonical+ is true, then Hash entries will always be stored in
+ * the same order, regardless of the Hash's internal state. This allows
+ * you to compare data structures by comparing their marshalled
+ * representations.
  *
  *     class Klass
  *       def initialize(str)
@@ -729,18 +742,31 @@ marshal_dump(argc, argv)
     int argc;
     VALUE* argv;
 {
-    VALUE obj, port, a1, a2;
+    VALUE obj, port, a1, a2, a3;
     int limit  1;
+    int canonical  ;
     struct dump_arg arg;
     struct dump_call_arg c_arg;
 
     port  nil;
-    rb_scan_args(argc, argv, "12", &obj, &a1, &a2);
-    if (argc 3) {
+    rb_scan_args(argc, argv, "13", &obj, &a1, &a2, &a3);
+    if (argc 4) {
+	canonical  TEST(a3);
 	if (!NIL_P(a2)) limit  UM2INT(a2);
 	if (NIL_P(a1)) goto type_error;
 	port  1;
     }
+    else if (argc 3) {
+	if (FIXNUM_P(a2)) {
+	   limit  UM2INT(a2);
+	   if (NIL_P(a1)) goto type_error;
+	   port  1;
+	} else {
+	   canonical  TEST(a2);
+	   limit  UM2INT(a1);
+	   port  nil;
+	}
+    }
     else if (argc 2) {
 	if (FIXNUM_P(a1)) limit  IX2INT(a1);
 	else if (NIL_P(a1)) goto type_error;
@@ -766,6 +792,7 @@ marshal_dump(argc, argv)
     arg.symbols  t_init_numtable();
     arg.data     t_init_numtable();
     arg.taint    false;
+    arg.canonical  anonical;
     c_arg.obj    bj;
     c_arg.arg    arg;
     c_arg.limit  imit;
@@ -1480,6 +1507,7 @@ Init_marshal()
     s_read  b_intern("read");
     s_write  b_intern("write");
     s_binmode  b_intern("binmode");
+    id_rehash  b_intern("rehash");
 
     rb_define_module_function(rb_mMarshal, "dump", marshal_dump, -1);
     rb_define_module_function(rb_mMarshal, "load", marshal_load, -1);

--------------070506010207040603010401--