Issue #6752 has been updated by naruse (Yui NARUSE).


duerst (Martin Dürst) wrote:
> I have thought about this a bit. Yui's patch to string treats this as a problem separate from transcoding. I think it is preferable to use the transcoding logic to implement this. The advantage is that exactly the same options and fallbacks can be used, and if we add a new option or fallback to transcode, it will be usable, too.

This method doesn't need the same options and fallbacks.
It needs only the invalid-related ones; it doesn't need the undef-related ones.
Moreover, the transcoder is usable only if Ruby has a transcoder for the target encoding.
But Ruby has some encodings which don't have a transcoder, for example emacs-mule.
Therefore this can't be built on the transcoder.

> Some more notes: The checks for converting from one encoding to the same encoding are in str_transcode0. Anywhere else? We need some data to drive the conversion, but this should be easy to generate, and will be the same for many 8-bit encodings.

Yeah, I also arrived at str_transcode0, and it is the correct place.

The data we need is the problem.
transcode doesn't have all the data, though tool/transcode-tblgen.rb has some base data.
The only place that has all the data we need is enc/*.

> It will be easy to catch invalid byte sequences, but I'm not sure it's worth to check unassigned codepoints, at least not in Unicode.

If we need to check unassigned codepoints, we must define encodings more strictly.
Even for Unicode, the definition would have to be per version.
I don't think it's worth checking.
----------------------------------------
Feature #6752: Replacing ill-formed subsequence
https://bugs.ruby-lang.org/issues/6752#change-38370

Author: naruse (Yui NARUSE)
Status: Assigned
Priority: Normal
Assignee: matz (Yukihiro Matsumoto)
Category: core
Target version: next minor


=begin
== ??
String????????????????????正?????????????????????????????????????????置????????置??????????????????

== ????????
??????確??????????????????????以???????????????
* twitter???title
* IRC?????
* ??????????????? API
* Web????????
?????????????正???????????????????????????????????????????????????????????????????詰???????????尾?????????????
??尾????????????????正?????????????????????????????????
??????????????????????????????????????????????????????中???混?????????????????????????????????????

* https://twitter.com/takahashim/status/18974040397
* https://twitter.com/n0kada/status/215674740705210368
* https://twitter.com/n0kada/status/215686490070585346
* https://twitter.com/hajimehoshi/status/215671146769682432
* http://po-ru.com/diary/fixing-invalid-utf-8-in-ruby-revisited/
* http://stackoverflow.com/questions/2982677/ruby-1-9-invalid-byte-sequence-in-utf-8

== ?????????: 置??????
????????String???
???????????????Unicode系???U+FFFD?????????以??????????????
??????????????空?????????????????????????????????????????????????????????????????????????????????
???????????弱???????????????????????
http://unicode.org/reports/tr36/#UTF-8_Exploit

== API
--- str.encode(str.encoding, invalid: replace, [replace: "???"])
* CSI????????????????????????
* iconv ???????????? glibc iconv ??? GNU libiconv ??? //IGNORE ???????????????????????
* ????????????????述????????????????????????????????(???????)

== ?????????
* ??????????????????????
* fix/repair invalid/illegal bytes/sequence ???????????????????

== ??
=== 鬼?????????
int ret = rb_enc_precise_mbclen(p, e, enc); ???????
MBCLEN_INVALID_P(ret) ??????????????????????????正????????????????????微?????
ONIGENC_CONSTRUCT_MBCLEN_INVALID() ????????????????????????????????????????
鬼??????????????????????影?????????????????????修正?????
?正???????????????????????????仮????????????????????????????????????

=== transcode???????
UCS正???????glibc iconv, GNU libiconv, Perl Encode???????????
CSI???transcode???????????身????????????????????
????????????????????????????????????????????????????????????????????


?????????????鬼??????????????????????????????添???????????????????

 diff --git a/string.c b/string.c
 index d038835..4808f15 100644
 --- a/string.c
 +++ b/string.c
 @@ -7426,6 +7426,199 @@ rb_str_ellipsize(VALUE str, long len)
      return ret;
  }
  
 +/*
 + *  call-seq:
 + *    str.fix_invalid -> new_str
 + *
 + *  If the string is well-formed, it returns a copy of the string.
 + *  Otherwise it returns a new string in which invalid byte sequences are
 + *  replaced with U+FFFD (UTF-8, UTF-16BE/LE, UTF-32BE/LE) or "?" (others).
 + */
 +VALUE
 +rb_str_fix_invalid(VALUE str)
 +{
 +    int cr = ENC_CODERANGE(str);
 +    rb_encoding *enc;
 +    if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID)
 +	return rb_str_dup(str);
 +
 +    enc = STR_ENC_GET(str);
 +    if (rb_enc_asciicompat(enc)) {
 +	const char *p = RSTRING_PTR(str);
 +	const char *e = RSTRING_END(str);
 +	const char *p1 = p;
 +	/* 10 should be enough for the usual use case,
 +	 * fixing a wrongly chopped character at the end of the string
 +	 */
 +	long room = 10;
 +	VALUE buf = rb_str_buf_new(RSTRING_LEN(str) + room);
 +	const char *rep;
 +	if (enc == rb_utf8_encoding())
 +	    rep = "\xEF\xBF\xBD";
 +	else
 +	    rep = "?";
 +	cr = ENC_CODERANGE_7BIT;
 +
 +	p = search_nonascii(p, e);
 +	if (!p) {
 +	    p = e;
 +	}
 +	while (p < e) {
 +	    int ret = rb_enc_precise_mbclen(p, e, enc);
 +	    if (MBCLEN_CHARFOUND_P(ret)) {
 +		if ((unsigned char)*p > 127) cr = ENC_CODERANGE_VALID;
 +		p += MBCLEN_CHARFOUND_LEN(ret);
 +	    }
 +	    else if (MBCLEN_INVALID_P(ret)) {
 +		const char *q;
 +		long clen = rb_enc_mbmaxlen(enc);
 +		if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
 +		q = RSTRING_END(buf);
 +
 +		if (e - p < clen) clen = e - p;
 +		if (clen < 3) {
 +		    clen = 1;
 +		}
 +		else {
 +		    long len = RSTRING_LEN(buf);
 +		    clen--;
 +		    rb_str_buf_cat(buf, p, clen);
 +		    for (; clen > 1; clen--) {
 +			ret = rb_enc_precise_mbclen(q, q + clen, enc);
 +			if (MBCLEN_NEEDMORE_P(ret)) {
 +			    break;
 +			}
 +			else if (MBCLEN_INVALID_P(ret)) {
 +			    continue;
 +			}
 +			else {
 +			    rb_bug("shouldn't reach here '%s'", q);
 +			}
 +		    }
 +		    rb_str_set_len(buf, len);
 +		}
 +		p += clen;
 +		p1 = p;
 +		rb_str_buf_cat2(buf, rep);
 +		p = search_nonascii(p, e);
 +		if (!p) {
 +		    p = e;
 +		    break;
 +		}
 +	    }
 +	    else if (MBCLEN_NEEDMORE_P(ret)) {
 +		break;
 +	    }
 +	    else {
 +		rb_bug("shouldn't reach here");
 +	    }
 +	}
 +	if (p1 < p) {
 +	    rb_str_buf_cat(buf, p1, p - p1);
 +	}
 +	if (p < e) {
 +	    rb_str_buf_cat2(buf, rep);
 +	    cr = ENC_CODERANGE_VALID;
 +	}
 +	ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
 +	return buf;
 +    }
 +    else if (rb_enc_dummy_p(enc)) {
 +	return rb_str_dup(str);
 +    }
 +    else {
 +	/* ASCII incompatible */
 +	const char *p = RSTRING_PTR(str);
 +	const char *e = RSTRING_END(str);
 +	const char *p1 = p;
 +	/* 10 should be enough for the usual use case,
 +	 * fixing a wrongly chopped character at the end of the string
 +	 */
 +	long room = 10;
 +	VALUE buf = rb_str_buf_new(RSTRING_LEN(str) + room);
 +	const char *rep;
 +	long mbminlen = rb_enc_mbminlen(enc);
 +	static rb_encoding *utf16be;
 +	static rb_encoding *utf16le;
 +	static rb_encoding *utf32be;
 +	static rb_encoding *utf32le;
 +	if (!utf16be) {
 +	    utf16be = rb_enc_find("UTF-16BE");
 +	    utf16le = rb_enc_find("UTF-16LE");
 +	    utf32be = rb_enc_find("UTF-32BE");
 +	    utf32le = rb_enc_find("UTF-32LE");
 +	}
 +	if (enc == utf16be) {
 +	    rep = "\xFF\xFD";
 +	}
 +	else if (enc == utf16le) {
 +	    rep = "\xFD\xFF";
 +	}
 +	else if (enc == utf32be) {
 +	    rep = "\x00\x00\xFF\xFD";
 +	}
 +	else if (enc == utf32le) {
 +	    rep = "\xFD\xFF\x00\x00";
 +	}
 +	else {
 +	    rep = "?";
 +	}
 +
 +	while (p < e) {
 +	    int ret = rb_enc_precise_mbclen(p, e, enc);
 +	    if (MBCLEN_CHARFOUND_P(ret)) {
 +		p += MBCLEN_CHARFOUND_LEN(ret);
 +	    }
 +	    else if (MBCLEN_INVALID_P(ret)) {
 +		const char *q;
 +		long clen = rb_enc_mbmaxlen(enc);
 +		if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
 +		q = RSTRING_END(buf);
 +
 +		if (e - p < clen) clen = e - p;
 +		if (clen < mbminlen * 3) {
 +		    clen = mbminlen;
 +		}
 +		else {
 +		    long len = RSTRING_LEN(buf);
 +		    clen -= mbminlen;
 +		    rb_str_buf_cat(buf, p, clen);
 +		    for (; clen > mbminlen; clen-=mbminlen) {
 +			ret = rb_enc_precise_mbclen(q, q + clen, enc);
 +			if (MBCLEN_NEEDMORE_P(ret)) {
 +			    break;
 +			}
 +			else if (MBCLEN_INVALID_P(ret)) {
 +			    continue;
 +			}
 +			else {
 +			    rb_bug("shouldn't reach here '%s'", q);
 +			}
 +		    }
 +		    rb_str_set_len(buf, len);
 +		}
 +		p += clen;
 +		p1 = p;
 +		rb_str_buf_cat2(buf, rep);
 +	    }
 +	    else if (MBCLEN_NEEDMORE_P(ret)) {
 +		break;
 +	    }
 +	    else {
 +		rb_bug("shouldn't reach here");
 +	    }
 +	}
 +	if (p1 < p) {
 +	    rb_str_buf_cat(buf, p1, p - p1);
 +	}
 +	if (p < e) {
 +	    rb_str_buf_cat2(buf, rep);
 +	}
 +	ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), ENC_CODERANGE_VALID);
 +	return buf;
 +    }
 +}
 +
  /**********************************************************************
   * Document-class: Symbol
   *
 @@ -7882,6 +8075,7 @@ Init_String(void)
      rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
      rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
      rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
 +    rb_define_method(rb_cString, "fix_invalid", rb_str_fix_invalid, 0);
  
      rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
      rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
 diff --git a/test/ruby/test_string.rb b/test/ruby/test_string.rb
 index 47f349c..2b0cfeb 100644
 --- a/test/ruby/test_string.rb
 +++ b/test/ruby/test_string.rb
 @@ -2031,6 +2031,29 @@ class TestString < Test::Unit::TestCase
  
      assert_equal(u("\x82")+("\u3042"*9), ("\u3042"*10).byteslice(2, 28))
    end
 +
 +  def test_fix_invalid
 +    assert_equal("\uFFFD\uFFFD\uFFFD", "\x80\x80\x80".fix_invalid)
 +    assert_equal("\uFFFDA", "\xF4\x80\x80A".fix_invalid)
 +
 +    # examples in Unicode 6.1.0 D93b
 +    assert_equal("\x41\uFFFD\uFFFD\x41\uFFFD\x41",
 +                 "\x41\xC0\xAF\x41\xF4\x80\x80\x41".fix_invalid)
 +    assert_equal("\x41\uFFFD\uFFFD\uFFFD\x41",
 +                 "\x41\xE0\x9F\x80\x41".fix_invalid)
 +    assert_equal("\u0061\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u0063\uFFFD\uFFFD\u0064",
 +                 "\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64".fix_invalid)
 +
 +    assert_equal("abcdefghijklmnopqrstuvwxyz\u0061\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u0063\uFFFD\uFFFD\u0064",
 +                 "abcdefghijklmnopqrstuvwxyz\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64".fix_invalid)
 +
 +    assert_equal("\uFFFD\u3042".encode("UTF-16BE"),
 +                 "\xD8\x00\x30\x42".force_encoding(Encoding::UTF_16BE).
 +                 fix_invalid)
 +    assert_equal("\uFFFD\u3042".encode("UTF-16LE"),
 +                 "\x00\xD8\x42\x30".force_encoding(Encoding::UTF_16LE).
 +                 fix_invalid)
 +  end
  end
  
  class TestString2 < TestString
=end



-- 
http://bugs.ruby-lang.org/