Issue #9816 has been updated by Yui NARUSE. 熟考の結果、Gem::Versionと仕様をあわせました。 理由は、 * Gem::Versionでこれを使ってくれればオブジェクトの生成数が減る * 2.2.0-preview1のようなRubyのバージョンの比較ができる からです。 順序のイメージは Prereleases sort between real releases (newest to oldest) のような感じです 1. 1.0 2. 1.0.b1 3. 1.0.a.2 4. 0.9 ```c diff --git a/string.c b/string.c index bec0bfd..e2b3c6f 100644 --- a/string.c +++ b/string.c @@ -2605,6 +2605,232 @@ rb_str_casecmp(VALUE str1, VALUE str2) return INT2FIX(-1); } +static int +version_string_p(VALUE str) +{ + const char *p = RSTRING_PTR(str); + const char *e = RSTRING_END(str); + + if (!rb_enc_asciicompat(STR_ENC_GET(str))) return FALSE; + + if (!ISDIGIT(*p)) return FALSE; + do { if (++p >= e) return TRUE; } while (ISDIGIT(*p)); + + while (*p == '.') { + if (++p >= e) return FALSE; + if (!ISALNUM(*p)) return FALSE; + do { if (++p >= e) return TRUE; } while (ISALNUM(*p)); + } + + if (*p != '-') return FALSE; + do { + if (++p >= e) return FALSE; + if (!ISALNUM(*p) && *p != '-') return FALSE; + do { if (++p >= e) return TRUE; } while (ISALNUM(*p) || *p == '-'); + } while (*p == '.'); + + return FALSE; +} + +/* return value: whether end of numeric part is EOS + * sp: first nonzero digit + * ep: end of digits + */ +static void +search_numerical_str(const char **sp, const char **ep) +{ + const char *p = *sp; + const char *e = *ep; + assert(p < e); + for (;;) { + if (*p != '0') break; + p++; + if (p == e) { + *sp = p; + goto finish; + } + } + *sp = p; + assert(p < e); + for (;;) { + if (!ISDIGIT(*p)) break; + p++; + if (p == e) { + goto finish; + } + } +finish: + *ep = p; + return; +} + +static VALUE +numerical_compare(const char **pp1, const char *p1end, const char **pp2, const char *p2end) +{ + const char *s1 = *pp1, *p1=p1end, *s2 = *pp2, *p2=p2end; + ptrdiff_t len1, len2; + int r; + + search_numerical_str(&s1, &p1); + search_numerical_str(&s2, &p2); + + /* compare digits length */ + len1 = p1 - s1; + len2 = p2 - s2; + if (len1 != len2) return INT2FIX(len1 < len2 ? 
-1 : 1); + + /* compare numeric value */ + r = memcmp(s1, s2, len1); + if (r) return r < 0 ? INT2FIX(-1) : INT2FIX(1); + + *pp1 = p1; + *pp2 = p2; + return Qnil; +} + +/* + * call-seq: + * str.versioncmp(other_str) -> -1, 0, +1 or nil + * + * Compare strings as version strings. + * + * "a1".versioncmp("a1") #=> 0 + * "aa".versioncmp("a1") #=> 1 + * "a1".versioncmp("aa") #=> -1 + * "a1".versioncmp("a01") #=> -1 + * "2.1.2".versioncmp("2.1.10") #=> -1 + */ + +static VALUE +rb_str_versioncmp(VALUE str1, VALUE str2) +{ + const char *p, *pe, *q, *qe; + + StringValue(str2); + if (!version_string_p(str1)) { + rb_raise(rb_eArgError, "receiver is not version string '%+"PRIsVALUE"'", str1); + } + if (!version_string_p(str2)) { + rb_raise(rb_eArgError, "argument is not version string '%+"PRIsVALUE"'", str2); + } + + p = RSTRING_PTR(str1); pe = RSTRING_END(str1); + q = RSTRING_PTR(str2); qe = RSTRING_END(str2); + + for (;;) { + if (*p == '-') { +hyphen_left: + if (*q == '-') goto next_char; + while (*q == '.') { + if (++q == qe) return INT2FIX(1); + } + if (*q != 'p') return INT2FIX(ISDIGIT(*q) || 'p' < *q ? -1 : 1); + if (++q == qe) return INT2FIX(1); + if (*q != 'r') return INT2FIX(ISDIGIT(*q) || 'r' < *q ? -1 : 1); + if (++q == qe) return INT2FIX(1); + if (*q != 'e') return INT2FIX(ISDIGIT(*q) || 'e' < *q ? -1 : 1); + if (++q == qe) return INT2FIX(1); + if (*q != '.') { + if (*q == '-') { + p++; + goto hyphen_right; + } + else if (ISALPHA(*q)) return INT2FIX(-1); + q--; /* DIGIT */ + } + } + else if (*q == '-') { +hyphen_right: + if (*p == '-') goto next_char; + while (*p == '.') { + if (++p == pe) return INT2FIX(-1); + } + if (*p != 'p') return INT2FIX(ISDIGIT(*p) || 'p' < *p ? 1 : -1); + if (++p == pe) return INT2FIX(-1); + if (*p != 'r') return INT2FIX(ISDIGIT(*p) || 'r' < *p ? 1 : -1); + if (++p == pe) return INT2FIX(-1); + if (*p != 'e') return INT2FIX(ISDIGIT(*p) || 'e' < *p ? 
1 : -1); + if (++p == pe) return INT2FIX(-1); + if (*p == '-') { + q++; + goto hyphen_left; + } + else if (ISALPHA(*p)) return INT2FIX(1); + else if (ISDIGIT(*p)) { + p--; /* DIGIT */ + } + } + else if (ISDIGIT(*p)) { + if (ISDIGIT(*q)) { + VALUE r = numerical_compare(&p, pe, &q, qe); + if(!NIL_P(r)) return r; + goto incremented; + } + else { + return INT2FIX(1); + } + } + else if (ISDIGIT(*q)) { + return INT2FIX(-1); + } + else if (ISALPHA(*p)) { + if (ISALPHA(*q)) { + for (;;) { + if (*p != *q) return INT2FIX(*p < *q ? -1 : 1); + p++; + q++; + if (p == pe) { + if (q == qe) return INT2FIX(0); + if (ISALPHA(*q)) return INT2FIX(-1); + goto incremented; + } + else if (q == qe) { + if (ISALPHA(*p)) return INT2FIX(1); + goto incremented; + } + else if (ISALPHA(*p)) { + if (!ISALPHA(*q)) return INT2FIX(1); + } + else if (ISALPHA(*q)) return INT2FIX(-1); + else goto incremented; + } + continue; + } + else return INT2FIX(1); + } + else if (ISALPHA(*q)) { + return INT2FIX(-1); + } + else rb_bug("%s %s",p,q); + +next_char: + p++; + q++; + +incremented: + while (*p == '.' && ++p != pe); + while (*q == '.' 
&& ++q != qe); + if (p == pe) { + if (q == qe) return INT2FIX(0); + if (ISDIGIT(*q)) { + return INT2FIX(-1); + } + else /*if (ISALPHA(*q) || *q == '-')*/ { + return INT2FIX(1); + } + } + else if (q == qe) { + if (ISDIGIT(*p)) { + return INT2FIX(1); + } + else /*if (ISALPHA(*p) || *p == '-')*/ { + return INT2FIX(-1); + } + } + } + UNREACHABLE; +} + #define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0) static long @@ -8778,6 +9004,7 @@ Init_String(void) rb_define_method(rb_cString, "eql?", rb_str_eql, 1); rb_define_method(rb_cString, "hash", rb_str_hash_m, 0); rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1); + rb_define_method(rb_cString, "versioncmp", rb_str_versioncmp, 1); rb_define_method(rb_cString, "+", rb_str_plus, 1); rb_define_method(rb_cString, "*", rb_str_times, 1); rb_define_method(rb_cString, "%", rb_str_format_m, 1); diff --git a/test/ruby/test_string.rb b/test/ruby/test_string.rb index e8decc0..9e92fb7 100644 --- a/test/ruby/test_string.rb +++ b/test/ruby/test_string.rb @@ -2112,6 +2112,41 @@ def test_casecmp assert_equal(1, "\u3042B".casecmp("\u3042a")) end + def test_versioncmp + require "rubygems" + ary = %w[ + 1 + 2 + 10 + 1.a + 1.a-a + 1.a-- + 1.a--.- + 1.a-1 + 1.a.q + 1.a--a + 1.a--1 + 1.a.pre.a + 1.a-pre.a + 1.a.pre-a + 1.a1 + 1.a2 + 1.aa + 1.b + 1.01 + 1.1 + 1.1a + 1.1-a + 1.1-b + 1.1q + 1.2 + 1.10 + ] + ary.product(ary) do |a, b| + assert_equal(Gem::Version.new(a)<=>Gem::Version.new(b), a.versioncmp(b), "#{a.dump}, #{b.dump}") + end + end + def test_upcase2 assert_equal("\u3042AB", "\u3042aB".upcase) end ``` ---------------------------------------- Feature #9816: 文字列内の数字を数値として比較するメソッド https://bugs.ruby-lang.org/issues/9816#change-48653 * Author: Yui NARUSE * Status: Assigned * Priority: Normal * Assignee: Yukihiro Matsumoto * Category: core * Target version: ---------------------------------------- 文字列内の数字を数値として比較するメソッドを追加しませんか そのような比較は一般的な用途としてはGUIシェルのファイラーが比較に用いており、 Windows では StrCmpLogicalW が、OS X では 
NSString:compare:options:へのNSNumericSearch定数が提供されています。 http://msdn.microsoft.com/en-us/library/windows/desktop/bb759947(v=vs.85).aspx https://developer.apple.com/library/mac/documentation/Cocoa/Reference/Foundation/Classes/NSString_Class/Reference/NSString.html#//apple_ref/c/econst/NSNumericSearch 上記のような処理自体はさほど難しいものではありませんが、Rubyレベルで実装すると大量のオブジェクトを作ってしまいます。 例えば `Gem::Version.new("2.1.10".freeze)<=>Gem::Version.new("2.1.9".freeze)` は47個、 `"2.1.10".freeze.split('.').map(&:to_i)<=>"2.1.9".freeze.split('.').map(&:to_i)` だと16個のオブジェクトを作ります。 `"2.1.10".freeze.numericcmp"2.1.9".freeze` ならば、もちろんオブジェクトは一つも作りません。 なお、上記の例でも示唆していますが、本メソッドは Ruby のバージョン表記の TEENY が2桁になった場合の比較に用いることができます。 パッチは以下の通りです。 なお、メソッド名は String#numericcmp としています。 (String#casecmpを念頭に置いた) ``` diff --git a/string.c b/string.c index c589c80..66f667f 100644 --- a/string.c +++ b/string.c @@ -2569,6 +2569,131 @@ rb_str_casecmp(VALUE str1, VALUE str2) return INT2FIX(-1); } +VALUE +numerical_compare(const char **pp1, const char *p1end, const char **pp2, const char *p2end) +{ + const char *s1 = *pp1, *p1, *s2 = *pp2, *p2; + ptrdiff_t len1, len2; + int r; + + while (s1 < p1end && *s1 == '0') s1++; + p1 = s1; + while (p1 < p1end && ISDIGIT(*p1)) p1++; + len1 = p1 - s1; + + while (s2 < p2end && *s2 == '0') s2++; + p2 = s2; + while (p2 < p2end && ISDIGIT(*p2)) p2++; + len2 = p2 - s2; + + if (len1 != len2) { + return INT2FIX(len1 < len2 ? -1 : 1); + } + + r = memcmp(s1, s2, len1); + if (r) return r < 0 ? INT2FIX(-1) : INT2FIX(1); + + len1 = s1 - *pp1; + len2 = s2 - *pp2; + if (len1 != len2) { + return INT2FIX(len1 < len2 ? -1 : 1); + } + + *pp1 = p1; + *pp2 = p2; + return Qnil; +} + +/* + * call-seq: + * str.numericcmp(other_str) -> -1, 0, +1 or nil + * + * Variant of <code>String#<=></code>, which treats runs of digits in strings + * as numeric values. 
+ * + * "a1".numericcmp("a1") #=> 0 + * "aa".numericcmp("a1") #=> 1 + * "a1".numericcmp("aa") #=> -1 + * "a1".numericcmp("a01") #=> -1 + * "2.1.2".numericcmp("2.1.10") #=> -1 + */ + +static VALUE +rb_str_numericcmp(VALUE str1, VALUE str2) +{ + long len; + rb_encoding *enc; + const char *p1, *p1end, *p2, *p2end; + + StringValue(str2); + enc = rb_enc_compatible(str1, str2); + if (!enc) { + return Qnil; + } + + p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1); + p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2); + if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) { + while (p1 < p1end && p2 < p2end) { + if (ISDIGIT(*p1)) { + if (ISDIGIT(*p2)) { + VALUE r = numerical_compare(&p1, p1end, &p2, p2end); + if (!NIL_P(r)) return r; + } + else { + return INT2FIX(-1); + } + } + else if (ISDIGIT(*p2)) { + return INT2FIX(1); + } + if (*p1 != *p2) return INT2FIX(*p1 < *p2 ? -1 : 1); + p1++; + p2++; + } + } + else { + while (p1 < p1end && p2 < p2end) { + int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc); + int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc); + + if (0 <= c1 && 0 <= c2) { + if (ISDIGIT(*p1)) { + if (ISDIGIT(*p2)) { + VALUE r = numerical_compare(&p1, p1end, &p2, p2end); + if (!NIL_P(r)) return r; + } + else { + return INT2FIX(-1); + } + } + else if (ISDIGIT(*p2)) { + return INT2FIX(1); + } + if (*p1 != *p2) return INT2FIX(*p1 < *p2 ? -1 : 1); + p1++; + p2++; + } + else { + int r; + l1 = rb_enc_mbclen(p1, p1end, enc); + l2 = rb_enc_mbclen(p2, p2end, enc); + len = l1 < l2 ? l1 : l2; + r = memcmp(p1, p2, len); + if (r != 0) + return INT2FIX(r < 0 ? -1 : 1); + if (l1 != l2) + return INT2FIX(l1 < l2 ? 
-1 : 1); + } + p1 += l1; + p2 += l2; + } + } + if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0); + if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1); + return INT2FIX(-1); +} + static long rb_str_index(VALUE str, VALUE sub, long offset) { @@ -8721,6 +8846,7 @@ Init_String(void) rb_define_method(rb_cString, "eql?", rb_str_eql, 1); rb_define_method(rb_cString, "hash", rb_str_hash_m, 0); rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1); + rb_define_method(rb_cString, "numericcmp", rb_str_numericcmp, 1); rb_define_method(rb_cString, "+", rb_str_plus, 1); rb_define_method(rb_cString, "*", rb_str_times, 1); rb_define_method(rb_cString, "%", rb_str_format_m, 1); diff --git a/test/ruby/test_string.rb b/test/ruby/test_string.rb index 8366424..f9c788b 100644 --- a/test/ruby/test_string.rb +++ b/test/ruby/test_string.rb @@ -2104,6 +2104,29 @@ class TestString < Test::Unit::TestCase assert_equal(1, "\u3042B".casecmp("\u3042a")) end + def test_numericcmp + assert_equal(-1, "2.1.0".numericcmp("2.1.1")) + assert_equal(-1, "2.1.9".numericcmp("2.1.10")) + assert_equal( 0, "a1".numericcmp("a1")) + assert_equal( 1, "aa".numericcmp("a1")) + assert_equal(-1, "a1".numericcmp("aa")) + assert_equal(-1, "a1".numericcmp("a01")) + assert_equal(-1, "a0001".numericcmp("a00001")) + assert_equal( 0, "a1a".numericcmp("a1a")) + assert_equal( 1, "a1b".numericcmp("a1a")) + assert_equal(-1, "a9a".numericcmp("a10a")) + assert_equal( 1, "b".numericcmp("a")) + assert_equal( 0, "\u30421".numericcmp("\u30421")) + assert_equal( 1, "\u3042\u3042".numericcmp("\u30421")) + assert_equal(-1, "\u30421".numericcmp("\u3042\u3042")) + assert_equal(-1, "\u30421".numericcmp("\u304201")) + assert_equal(-1, "\u30420001".numericcmp("\u304200001")) + assert_equal( 0, "\u30421\u3042".numericcmp("\u30421\u3042")) + assert_equal( 1, "\u30421\u3044".numericcmp("\u30421\u3042")) + assert_equal(-1, "\u30429\u3042".numericcmp("\u304210\u3042")) + assert_equal( 1, "\u3044".numericcmp("\u3042")) 
+ end + def test_upcase2 assert_equal("\u3042AB", "\u3042aB".upcase) end ``` -- https://bugs.ruby-lang.org/