Issue #6752 has been updated by matz (Yukihiro Matsumoto). OK, I agree with enhancement of String#encoding and String#scrub. Matz. ---------------------------------------- Feature #6752: Replacing ill-formed subsequence https://bugs.ruby-lang.org/issues/6752#change-38729 Author: naruse (Yui NARUSE) Status: Assigned Priority: Normal Assignee: matz (Yukihiro Matsumoto) Category: core Target version: next minor =begin == æ¦?è¦? String??«ã?ªã??????????®ç????±ã?§ä??æ£ã?ªã????¤ã??????????«ã?¾ã????¦ã??????????«ã???????????ç½®æ?????å???§ç½®?????????????????? == ??¦ã?¼ã?¹ã?±ã?¼ã?? å®??????«ç¢ºèª?????????¦ã???????¦ã?¼ã?¹ã?±ã?¼ã?¹ã?¯ä»¥ä¸???®é???????§ã????? * twitter???title * IRC??®ã?ã?? * ?????³ã????³å????»ã?? API * Web??¯ã?ã?¼ã?ªã?³ã?? ???????????®ä??æ£ã?ªã????¤ã???????®ç??????? ????¯ã?????????????????????¤ã?????ä½???§æ??å?????????????è©°ã??????????«æ?«å°¾???????????¦ã?? ??«å°¾???????????????ä¸?æ£ã?ªæ??å???????ä½?????????¾ã?????ï¼????äº????ï¼? ???????????³ã?³ã???????«å?¥ã????????çµ???????????????¨ã?«ã????£ã?¦ã?????ä¸ã?«ã??æ··ã????£ã?????å???????ä½?????????¾ã?????ï¼?å¾?äº????ï¼? * https://twitter.com/takahashim/status/18974040397 * https://twitter.com/n0kada/status/215674740705210368 * https://twitter.com/n0kada/status/215686490070585346 * https://twitter.com/hajimehoshi/status/215671146769682432 * http://po-ru.com/diary/fixing-invalid-utf-8-in-ruby-revisited/ * http://stackoverflow.com/questions/2982677/ruby-1-9-invalid-byte-sequence-in-utf-8 == å¿?è¦???ªå?????: ç½®æ?????å? ?????¥å?½ã??String??? ????????©ã?«ã????¯ã??Unicodeç³»ã?ªã??U+FFFD?????????以å????§ã?¯ã????????? ????????©ã?«ã?????空æ??å???§ã?ªã???????±ã?¯ã???????¤ã????¦ã????¾ã???????¨ã?§ã??å¾???¥ã?¯å????¨ã????ªã????£ã???????¼ã?¯ã?³ã??ä½??????¦ã????¾ã????? ä¸?ä½???®ã?¬ã?¤ã?¤ã?¼ã?®è??å¼±æ?§ã?«ç????????????????§ã????? http://unicode.org/reports/tr36/#UTF-8_Exploit == API --- str.encode(str.encoding, invalid: replace, [replace: "???"]) * CSI???????????ªã???¦æ???????¡æ?ªã?? * iconv ??§ã?§ã???????®ã?? glibc iconv ??? 
GNU libiconv ??? //IGNORE ??¤ã??????????§ä????¯ã?§ã????ªã?? * å®?è£?ä¸???®ã?¡ã?ªã???????¯å??è¿°ã?®é??????????´æ????«å???????¦ã????¾ã????ªã??(??¨æ?????) == ??¥ã?¡ã?½ã????? * ??°ã???????¡ã?½ã???????§ã????? * fix/repair invalid/illegal bytes/sequence ???????????®å???????? == å®?è£? === 鬼è???????¼ã?? int ret = rb_enc_precise_mbclen(p, e, enc); ?????¦ã?? MBCLEN_INVALID_P(ret) ????????ªæ?????ä½??????¤ã????®ã??ä¸?æ£ã?ªã?®ã????????????ªã????®ã??å¾®å????? ONIGENC_CONSTRUCT_MBCLEN_INVALID() ????????¤ã????°ã??????????ªã????®ã??????????ªã?®ã?§ã?? 鬼è????®ã?¨ã?³ã?³ã?¼ã????£ã?³ã?°ã?¢ã?¸ã?¥ã?¼ã?«å?¨ã?¦ã?«å½±??¿ã????¦ã????¾ã???????????ä¿®æ£??°é?£ã?? ä¸?æ£ã?ªã????¤ã????¯ã?»ã?¨ã????©å????¨ã????ªã????¨ä»®å®??????¦ã????¹ç??????????²ã?«ã???????°å????¿ã?¯å?½ã?? === transcode?????¼ã?? UCSæ£è???????glibc iconv, GNU libiconv, Perl Encode??ªã?©ã?¨é????£ã?¦ã?? CSI???transcode??§ã?¯ã????ªå????ªèº«??«å?????????????´å????? ??¨ã?³ã?³ã?¼ã????£ã?³ã?°ã????¨ã?«ã??ä½?????????ªã?????å¤??????¢ã?¸ã?¥ã?¼ã?«ã????¨æ??????ªã????¨ã???????ªã????? ??¨ã???????????鬼è???????¼ã?¹ã?®ã?³ã?³ã?»ã?????å®?è£???¨ã????¹ã?????æ·»ä???????¦ã???????¾ã????? diff --git a/string.c b/string.c index d038835..4808f15 100644 --- a/string.c +++ b/string.c @@ -7426,6 +7426,199 @@ rb_str_ellipsize(VALUE str, long len) return ret; } +/* + * call-seq: + * str.fix_invalid -> new_str + * + * If the string is well-formed, it returns self. + * If the string has invalid byte sequence, repair it with given replacement + * character. 
+ */ +VALUE +rb_str_fix_invalid(VALUE str) +{ + int cr = ENC_CODERANGE(str); + rb_encoding *enc; + if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) + return rb_str_dup(str); + + enc = STR_ENC_GET(str); + if (rb_enc_asciicompat(enc)) { + const char *p = RSTRING_PTR(str); + const char *e = RSTRING_END(str); + const char *p1 = p; + /* 10 should be enough for the usual use case, + * fixing a wrongly chopped character at the end of the string + */ + long room = 10; + VALUE buf = rb_str_buf_new(RSTRING_LEN(str) + room); + const char *rep; + if (enc == rb_utf8_encoding()) + rep = "\xEF\xBF\xBD"; + else + rep = "?"; + cr = ENC_CODERANGE_7BIT; + + p = search_nonascii(p, e); + if (!p) { + p = e; + } + while (p < e) { + int ret = rb_enc_precise_mbclen(p, e, enc); + if (MBCLEN_CHARFOUND_P(ret)) { + if ((unsigned char)*p > 127) cr = ENC_CODERANGE_VALID; + p += MBCLEN_CHARFOUND_LEN(ret); + } + else if (MBCLEN_INVALID_P(ret)) { + const char *q; + long clen = rb_enc_mbmaxlen(enc); + if (p > p1) rb_str_buf_cat(buf, p1, p - p1); + q = RSTRING_END(buf); + + if (e - p < clen) clen = e - p; + if (clen < 3) { + clen = 1; + } + else { + long len = RSTRING_LEN(buf); + clen--; + rb_str_buf_cat(buf, p, clen); + for (; clen > 1; clen--) { + ret = rb_enc_precise_mbclen(q, q + clen, enc); + if (MBCLEN_NEEDMORE_P(ret)) { + break; + } + else if (MBCLEN_INVALID_P(ret)) { + continue; + } + else { + rb_bug("shouldn't reach here '%s'", q); + } + } + rb_str_set_len(buf, len); + } + p += clen; + p1 = p; + rb_str_buf_cat2(buf, rep); + p = search_nonascii(p, e); + if (!p) { + p = e; + break; + } + } + else if (MBCLEN_NEEDMORE_P(ret)) { + break; + } + else { + rb_bug("shouldn't reach here"); + } + } + if (p1 < p) { + rb_str_buf_cat(buf, p1, p - p1); + } + if (p < e) { + rb_str_buf_cat2(buf, rep); + cr = ENC_CODERANGE_VALID; + } + ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr); + return buf; + } + else if (rb_enc_dummy_p(enc)) { + return rb_str_dup(str); + } + else { + /* ASCII 
incompatible */ + const char *p = RSTRING_PTR(str); + const char *e = RSTRING_END(str); + const char *p1 = p; + /* 10 should be enough for the usual use case, + * fixing a wrongly chopped character at the end of the string + */ + long room = 10; + VALUE buf = rb_str_buf_new(RSTRING_LEN(str) + room); + const char *rep; + long mbminlen = rb_enc_mbminlen(enc); + static rb_encoding *utf16be; + static rb_encoding *utf16le; + static rb_encoding *utf32be; + static rb_encoding *utf32le; + if (!utf16be) { + utf16be = rb_enc_find("UTF-16BE"); + utf16le = rb_enc_find("UTF-16LE"); + utf32be = rb_enc_find("UTF-32BE"); + utf32le = rb_enc_find("UTF-32LE"); + } + if (enc == utf16be) { + rep = "\xFF\xFD"; + } + else if (enc == utf16le) { + rep = "\xFD\xFF"; + } + else if (enc == utf32be) { + rep = "\x00\x00\xFF\xFD"; + } + else if (enc == utf32le) { + rep = "\xFD\xFF\x00\x00"; + } + else { + rep = "?"; + } + + while (p < e) { + int ret = rb_enc_precise_mbclen(p, e, enc); + if (MBCLEN_CHARFOUND_P(ret)) { + p += MBCLEN_CHARFOUND_LEN(ret); + } + else if (MBCLEN_INVALID_P(ret)) { + const char *q; + long clen = rb_enc_mbmaxlen(enc); + if (p > p1) rb_str_buf_cat(buf, p1, p - p1); + q = RSTRING_END(buf); + + if (e - p < clen) clen = e - p; + if (clen < mbminlen * 3) { + clen = mbminlen; + } + else { + long len = RSTRING_LEN(buf); + clen -= mbminlen; + rb_str_buf_cat(buf, p, clen); + for (; clen > mbminlen; clen-=mbminlen) { + ret = rb_enc_precise_mbclen(q, q + clen, enc); + if (MBCLEN_NEEDMORE_P(ret)) { + break; + } + else if (MBCLEN_INVALID_P(ret)) { + continue; + } + else { + rb_bug("shouldn't reach here '%s'", q); + } + } + rb_str_set_len(buf, len); + } + p += clen; + p1 = p; + rb_str_buf_cat2(buf, rep); + } + else if (MBCLEN_NEEDMORE_P(ret)) { + break; + } + else { + rb_bug("shouldn't reach here"); + } + } + if (p1 < p) { + rb_str_buf_cat(buf, p1, p - p1); + } + if (p < e) { + rb_str_buf_cat2(buf, rep); + } + ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), ENC_CODERANGE_VALID); + 
return buf; + } +} + /********************************************************************** * Document-class: Symbol * @@ -7882,6 +8075,7 @@ Init_String(void) rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1); rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2); rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1); + rb_define_method(rb_cString, "fix_invalid", rb_str_fix_invalid, 0); rb_define_method(rb_cString, "to_i", rb_str_to_i, -1); rb_define_method(rb_cString, "to_f", rb_str_to_f, 0); diff --git a/test/ruby/test_string.rb b/test/ruby/test_string.rb index 47f349c..2b0cfeb 100644 --- a/test/ruby/test_string.rb +++ b/test/ruby/test_string.rb @@ -2031,6 +2031,29 @@ class TestString < Test::Unit::TestCase assert_equal(u("\x82")+("\u3042"*9), ("\u3042"*10).byteslice(2, 28)) end + + def test_fix_invalid + assert_equal("\uFFFD\uFFFD\uFFFD", "\x80\x80\x80".fix_invalid) + assert_equal("\uFFFDA", "\xF4\x80\x80A".fix_invalid) + + # exapmles in Unicode 6.1.0 D93b + assert_equal("\x41\uFFFD\uFFFD\x41\uFFFD\x41", + "\x41\xC0\xAF\x41\xF4\x80\x80\x41".fix_invalid) + assert_equal("\x41\uFFFD\uFFFD\uFFFD\x41", + "\x41\xE0\x9F\x80\x41".fix_invalid) + assert_equal("\u0061\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u0063\uFFFD\uFFFD\u0064", + "\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64".fix_invalid) + + assert_equal("abcdefghijklmnopqrstuvwxyz\u0061\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u0063\uFFFD\uFFFD\u0064", + "abcdefghijklmnopqrstuvwxyz\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64".fix_invalid) + + assert_equal("\uFFFD\u3042".encode("UTF-16BE"), + "\xD8\x00\x30\x42".force_encoding(Encoding::UTF_16BE). + fix_invalid) + assert_equal("\uFFFD\u3042".encode("UTF-16LE"), + "\x00\xD8\x42\x30".force_encoding(Encoding::UTF_16LE). + fix_invalid) + end end class TestString2 < TestString =end -- http://bugs.ruby-lang.org/