Hi,

At Tue, 27 Jul 2010 22:21:31 +0900,
Heesob Park wrote in [ruby-core:31512]:
> I noticed that String#inspect produces \x{XXXX} for encodings other than Unicode.
> 
> Is there any possibility that \x{XXXX} could be accepted as an escape sequence in string literals?
> 
> irb(main):004:0> a = "\xC7\xD1\xB1\xDB"

This is in binary representation.

> irb(main):010:0> a[1]
> => "\x{B1DB}"

But this is in codepoint representation.

I'm afraid that mixing these two representations under the same \x prefix may confuse users.


diff --git a/parse.y b/parse.y index ba52135..ec13fb6 100644 --- a/parse.y +++ b/parse.y @@ -5456,8 +5456,8 @@ parser_tok_hex(struct parser_params *parser, size_t *numlen) #define tokcopy(n) memcpy(tokspace(n), lex_p - (n), (n)) static int -parser_tokadd_utf8(struct parser_params *parser, rb_encoding **encp, - int string_literal, int symbol_literal, int regexp_literal) +parser_tokadd_multibyte(struct parser_params *parser, rb_encoding **encp, int enctype, + int string_literal, int symbol_literal, int regexp_literal) { /* * If string_literal is true, then we allow multiple codepoints @@ -5466,22 +5466,28 @@ parser_tokadd_utf8(struct parser_params *parser, rb_encoding **encp, * codepoint without adding it */ - int codepoint; - size_t numlen; + int codepoint, unicode_p = enctype == 'u', mblen; + size_t numlen, maxlen; + char errmsg[64]; + const char *encname = unicode_p ? "Unicode" : (*encp)->name; - if (regexp_literal) { tokadd('\\'); tokadd('u'); } + if (regexp_literal) { tokadd('\\'); tokadd(enctype); } if (peek('{')) { /* handle \u{...} form */ + maxlen = unicode_p ? 6 : 4; do { if (regexp_literal) { tokadd(*lex_p); } nextc(); - codepoint = scan_hex(lex_p, 6, &numlen); + codepoint = scan_hex(lex_p, maxlen, &numlen); if (numlen == 0) { - yyerror("invalid Unicode escape"); + snprintf(errmsg, sizeof(errmsg), "invalid %s escape", encname); + yyerror(errmsg); return 0; } - if (codepoint > 0x10ffff) { - yyerror("invalid Unicode codepoint (too large)"); + mblen = ONIGENC_CODE_TO_MBCLEN(unicode_p ? 
UTF8_ENC() : *encp, codepoint); + if (!MBCLEN_CHARFOUND_P(mblen)) { + snprintf(errmsg, sizeof(errmsg), "invalid %s codepoint", encname); + yyerror(errmsg); return 0; } lex_p += numlen; @@ -5489,7 +5495,7 @@ parser_tokadd_utf8(struct parser_params *parser, rb_encoding **encp, tokcopy((int)numlen); } else if (codepoint >= 0x80) { - *encp = UTF8_ENC(); + if (unicode_p) *encp = UTF8_ENC(); if (string_literal) tokaddmbc(codepoint, *encp); } else if (string_literal) { @@ -5506,16 +5512,18 @@ parser_tokadd_utf8(struct parser_params *parser, rb_encoding **encp, nextc(); } else { /* handle \uxxxx form */ - codepoint = scan_hex(lex_p, 4, &numlen); - if (numlen < 4) { - yyerror("invalid Unicode escape"); + maxlen = unicode_p ? 4 : 2; + codepoint = scan_hex(lex_p, maxlen, &numlen); + if (numlen < maxlen) { + snprintf(errmsg, sizeof(errmsg), "invalid %s escape", encname); + yyerror(errmsg); return 0; } - lex_p += 4; + lex_p += numlen; if (regexp_literal) { - tokcopy(4); + tokcopy(numlen); } - else if (codepoint >= 0x80) { + else if (codepoint >= 0x80 && unicode_p) { *encp = UTF8_ENC(); if (string_literal) tokaddmbc(codepoint, *encp); } @@ -5570,6 +5578,9 @@ parser_read_escape(struct parser_params *parser, int flags, return c; case 'x': /* hex constant */ + if (peek('{')) { + + } c = tok_hex(&numlen); if (numlen == 0) return 0; return c; @@ -5825,13 +5836,14 @@ parser_tokadd_string(struct parser_params *parser, break; case 'u': + case 'x': if ((func & STR_FUNC_EXPAND) == 0) { tokadd('\\'); break; } - parser_tokadd_utf8(parser, &enc, 1, - func & STR_FUNC_SYMBOL, - func & STR_FUNC_REGEXP); + parser_tokadd_multibyte(parser, &enc, c, 1, + func & STR_FUNC_SYMBOL, + func & STR_FUNC_REGEXP); if (has_nonascii && enc != *encp) { mixed_escape(beg, enc, *encp); } @@ -6855,9 +6867,9 @@ parser_yylex(struct parser_params *parser) goto ternary; } else if (c == '\\') { - if (peek('u')) { - nextc(); - c = parser_tokadd_utf8(parser, &enc, 0, 0, 0); + c = nextc(); + if (c == 'u' || c == 'x') { + c 
= parser_tokadd_multibyte(parser, &enc, c, 0, 0, 0); if (0x80 <= c) { tokaddmbc(c, enc); } @@ -6866,6 +6878,7 @@ parser_yylex(struct parser_params *parser) } } else { + pushback(c); c = read_escape(0, &enc); tokadd(c); } diff --git a/test/ruby/test_m17n.rb b/test/ruby/test_m17n.rb index 07cda75..28996ab 100644 --- a/test/ruby/test_m17n.rb +++ b/test/ruby/test_m17n.rb @@ -159,6 +159,9 @@ class TestM17N < Test::Unit::TestCase assert_encoding("EUC-JP", eval(e(%{"\\x20"})).encoding) assert_encoding("EUC-JP", eval(e(%{"\\n"})).encoding) assert_encoding("EUC-JP", eval(e(%{"\\x80"})).encoding) + str = eval(e(%{"\\x{a1a1}"})) + assert_encoding("EUC-JP", str.encoding) + assert_equal(0xa1a1, str.ord) end def test_utf8_literal
-- Nobu Nakada