This is a multi-part message in MIME format. --------------000303010108010007030107 Content-Type: text/plain; charset=ISO-8859-1; format=flowed Content-Transfer-Encoding: 7bit Back at the end of August, Matz wrote (see http://www.ruby-forum.com/topic/123224#548617): > > We just had a meeting to discuss about issues like this yesterday. > And the end result was > > \xXX -> single byte > \uXXXX -> single Unicode character by codepoint (BMP) > \u{XXXXXXXX} -> single Unicode character up to 4 bytes I've attached a patch to parse.y that implements these two forms of \u escapes within string literals. Please treat this as proof-of-concept only. I hope it proves useful, but I am too unfamiliar with parse.y to know what I'm doing wrong here (and I'm sure I'm doing something). Despite the fact that the u in "\u" stands for Unicode, I wrote this patch to work with whatever the parser->enc encoding was. If you run Ruby with -Ku, you get Unicode codepoints translated to UTF-8, as expected. But with -Ks or -Ke, it takes whatever codepoint you specify and translates it to the appropriate multibyte sequence for SJIS or EUC. (I don't know how to test this, however.) If you run the parser in ASCII mode, then \u1234 is equivalent to \x12\x34. Similarly for the \u{} form. I don't know if this is actually a good idea or not. There is a strong case to be made that \u should always create utf-8 bytes regardless of the encoding the parser is using. David --------------000303010108010007030107 Content-Type: text/plain; name="patch" Content-Transfer-Encoding: 7bit Content-Disposition: inline; filename="patch" Index: parse.y --- parse.y (revision 13739) +++ parse.y (working copy) @@ -4975,6 +4975,38 @@ if (mb && (c >= 0x80)) *mb = ENC_CODERANGE_UNKNOWN; return c; + case 'u': /* Unicode constant */ + { + int numlen; + int close; + + if ((c = nextc()) == '{') { + c = scan_hex(lex_p, 8, &numlen); + if (numlen < 2) { + yyerror("Invalid Unicode escape"); + return 0; + } + lex_p += numlen; + + if ((close = nextc()) != '
}') { + pushback(close); + yyerror("Unterminated Unicode escape"); + return 0; + } + } + else { + pushback(c); + c = scan_hex(lex_p, 4, &numlen); + if (numlen < 4) { + yyerror("Invalid Unicode escape"); + return 0; + } + lex_p += numlen; + } + } + if (mb && (c >= 0x80)) *mb = ENC_CODERANGE_UNKNOWN; + return c; + case 'b': /* backspace */ return '\010'; @@ -5068,6 +5100,47 @@ } return 0; + case 'u': /* Unicode constant */ + { + int numlen; + int hex; + + tokadd('\\'); + tokadd(c); + + if ((c = nextc()) == '{') { + hex = scan_hex(lex_p, 8, &numlen); + if (numlen < 2) { + yyerror("Invalid Unicode escape"); + return -1; + } + tokadd('{'); + while (numlen--) + tokadd(nextc()); + if ((c = nextc()) == '}') + tokadd(c); + else { + pushback(c); + yyerror("Unterminated Unicode escape"); + return -1; + } + } + else { + pushback(c); + hex = scan_hex(lex_p, 4, &numlen); + if (numlen < 4) { + yyerror("Invalid Unicode escape"); + return -1; + } + + while (numlen--) + tokadd(nextc()); + } + + if (mb && (hex >= 0x80)) *mb = ENC_CODERANGE_UNKNOWN; + } + return 0; + case 'M': if ((c = nextc()) != '-') { yyerror("Invalid escape character syntax"); @@ -5230,6 +5303,31 @@ pushback(c); if (func & STR_FUNC_ESCAPE) tokadd('\\'); c = read_escape(mb); + + /* + * The \u escape can return codepoints > 255. + * Encode these using the current encoding. 
+ * If the encoding is a single-byte encoding, then + * treat \u as a repeated \x and just encode the bytes + */ + if (c > 255) { + int maxbytes = rb_enc_mbmaxlen(parser->enc); + UChar buf[maxbytes]; + if (maxbytes > 1) { + int i,n; + n = rb_enc_mbcput(c,buf,parser->enc); + for(i=0; i < n; i++) tokadd(buf[i]); + } + else { + /* in ASCII, \uWXYZ is just \xWX\xYZ */ + /* \u{12345678} is \x12\x34\x56\x78 */ + if (c & 0xff000000) tokadd(c >> 24); + if (c & 0x00ff0000) tokadd((c >> 16) & 0xff); + if (c & 0x0000ff00) tokadd((c >> 8) & 0xff); + tokadd(c & 0xff); + } + continue; + } } else if ((func & STR_FUNC_QWORDS) && ISSPACE(c)) { /* ignore backslashed spaces in %w */ --------------000303010108010007030107--