This is a multi-part message in MIME format.
--------------070506010207040603010401
Content-Type: text/plain; charset=ISO-8859-1; format=flowed
Content-Transfer-Encoding: 7bit
Currently Marshal.dump stores Hash entries in no specific order. This
prevents one from reliably comparing marshalled data in order to
determine whether a data structure changed. For example, consider the
following code:
hash 'root' [1, [1, 1.5], 3, 4], 'foo' 'zort' }
data arshal.dump(hash)
data2 arshal.dump(Marshal.load(data))
"data ! ata2" might be possible.
PStore uses an MD5 checksum of the marshalled data to check whether the
file needs to be updated. Marshal.dump's non-deterministic hash ordering
gets in the way.
The attached patch adds a 'canonical' argument to Marshal.dump. This
argument allows Hash entries to be stored in a deterministic order, at
the cost of a slight performance hit. (PStore will have to be modified
to take advantage of this, but I'm already working on that.) The patch
is made against Ruby 1.8.6.
Regards,
Hongli Lai
--------------070506010207040603010401
Content-Type: text/x-patch;
name uby-marshal-dump-canonical.diff"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
filename uby-marshal-dump-canonical.diff"
diff --git a/marshal.c b/marshal.c
index 185ace6..8643894 100644
--- a/marshal.c
+++ b/marshal.c
@@ -83,6 +83,7 @@ shortlen(len, ds)
static ID s_dump, s_load, s_mdump, s_mload;
static ID s_dump_data, s_load_data, s_alloc;
static ID s_getc, s_read, s_write, s_binmode;
+static ID id_rehash;
struct dump_arg {
VALUE obj;
@@ -90,6 +91,7 @@ struct dump_arg {
st_table *symbols;
st_table *data;
int taint;
+ int canonical;
};
struct dump_call_arg {
@@ -621,6 +623,9 @@ w_object(obj, arg, limit)
w_byte(TYPE_HASH_DEF, arg);
}
w_long(RHASH(obj)->tbl->num_entries, arg);
+ if (arg->canonical) {
+ rb_funcall(obj, id_rehash, 0);
+ }
rb_hash_foreach(obj, hash_each, (st_data_t)&c_arg);
if (!NIL_P(RHASH(obj)->ifnone)) {
w_object(RHASH(obj)->ifnone, arg, limit);
@@ -700,13 +705,21 @@ dump_ensure(arg)
/*
* call-seq:
- * dump( obj [, anIO] , limit 1 ) anIO
+ * dump( obj [, anIO] , limit , canonicalů±se ) anIO
+ *
+ * Serializes obj and all descendent objects.
+ *
+ * If +anIO+ is specified, the serialized data will be written to it,
+ * otherwise the data will be returned as a String.
*
- * Serializes obj and all descendent objects. If anIO is
- * specified, the serialized data will be written to it, otherwise the
- * data will be returned as a String. If limit is specified, the
- * traversal of subobjects will be limited to that depth. If limit is
- * negative, no checking of depth will be performed.
+ * If +limit+ is specified, the traversal of subobjects will be limited
+ * to that depth. If limit is negative, no checking of depth will be
+ * performed.
+ *
+ * If +canonical+ is true, then Hash entries will always be stored in
+ * the same order, regardless of the Hash's internal state. This allows
+ * you to compare data structures by comparing their marshalled
+ * representations.
*
* class Klass
* def initialize(str)
@@ -729,18 +742,31 @@ marshal_dump(argc, argv)
int argc;
VALUE* argv;
{
- VALUE obj, port, a1, a2;
+ VALUE obj, port, a1, a2, a3;
int limit 1;
+ int canonical ;
struct dump_arg arg;
struct dump_call_arg c_arg;
port nil;
- rb_scan_args(argc, argv, "12", &obj, &a1, &a2);
- if (argc 3) {
+ rb_scan_args(argc, argv, "13", &obj, &a1, &a2, &a3);
+ if (argc 4) {
+ canonical TEST(a3);
if (!NIL_P(a2)) limit UM2INT(a2);
if (NIL_P(a1)) goto type_error;
port 1;
}
+ else if (argc 3) {
+ if (FIXNUM_P(a2)) {
+ limit UM2INT(a2);
+ if (NIL_P(a1)) goto type_error;
+ port 1;
+ } else {
+ canonical TEST(a2);
+ limit UM2INT(a1);
+ port nil;
+ }
+ }
else if (argc 2) {
if (FIXNUM_P(a1)) limit IX2INT(a1);
else if (NIL_P(a1)) goto type_error;
@@ -766,6 +792,7 @@ marshal_dump(argc, argv)
arg.symbols t_init_numtable();
arg.data t_init_numtable();
arg.taint false;
+ arg.canonical anonical;
c_arg.obj bj;
c_arg.arg arg;
c_arg.limit imit;
@@ -1480,6 +1507,7 @@ Init_marshal()
s_read b_intern("read");
s_write b_intern("write");
s_binmode b_intern("binmode");
+ id_rehash b_intern("rehash");
rb_define_module_function(rb_mMarshal, "dump", marshal_dump, -1);
rb_define_module_function(rb_mMarshal, "load", marshal_load, -1);
--------------070506010207040603010401--