This is a multi-part message in MIME format.
--------------000303010108010007030107
Content-Type: text/plain; charset=ISO-8859-1; format=flowed
Content-Transfer-Encoding: 7bit

Back at the end of August, Matz wrote (see 
http://www.ruby-forum.com/topic/123224#548617):
> 
> We just had a meeting to discuss about issues like this yesterday.
> And the end result was
> 
>   \xXX         -> single byte
>   \uXXXX       -> single Unicode character by codepoint (BMP)
>   \u{XXXXXXXX} -> single Unicode character up to 4 bytes

I've attached a patch to parse.y that implements these two forms of \u 
escapes within string literals.  Please treat this as a proof of concept 
only. I hope it proves useful, but I am too unfamiliar with parse.y to 
know what I'm doing wrong here (and I'm sure I'm doing something wrong).

Despite the fact that the u in "\u" stands for Unicode, I wrote this 
patch to work with whatever encoding parser->enc specifies. If you run 
Ruby with -Ku, you get Unicode codepoints translated to UTF-8, as expected.

But with -Ks or -Ke, it takes whatever codepoint you specify and 
translates it to the appropriate multibyte sequence for SJIS or EUC. (I 
don't know how to test this, however.)  If you run the parser in ASCII 
mode, then \u1234 is equivalent to \x12\x34.  Similarly for the \u{} 
form.  I don't know if this is actually a good idea or not.  There is a 
strong case to be made that \u should always create UTF-8 bytes 
regardless of the encoding the parser is using.

	David

--------------000303010108010007030107
Content-Type: text/plain;
 name="patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
 filename="patch"

Index: parse.y
--- parse.y	(revision 13739)
+++ parse.y	(working copy)
@@ -4975,6 +4975,38 @@
 	if (mb && (c > x80)) *mb  NC_CODERANGE_UNKNOWN;
 	return c;
 
+      case 'u':	/* Unicode constant */
+	{
+	    int numlen;
+	    int close;
+
+	    if ((c  extc()) '{') {
+		c  can_hex(lex_p, 8, &numlen);
+		if (numlen < 2)	{
+		    yyerror("Invalid Unicode escape");
+		    return 0;
+		}
+		lex_p + umlen;
+
+		if ((close  extc()) ! }') {
+		    pushback(close);
+		    yyerror("Unterminated Unicode escape");
+		    return 0;
+		}
+	    }
+	    else {
+		pushback(c);
+		c  can_hex(lex_p, 4, &numlen);
+		if (numlen < 4) {
+		    yyerror("Invalid Unicode escape");
+		    return 0;
+		}
+		lex_p + umlen;
+	    }
+	}
+	if (mb && (c > x80)) *mb  NC_CODERANGE_UNKNOWN;
+	return c;
+
       case 'b':	/* backspace */
 	return '\010';
 
@@ -5068,6 +5100,47 @@
 	}
 	return 0;
 
+      case 'u':	/* Unicode constant */
+	{
+	    int numlen;
+	    int hex;
+
+	    tokadd('\\');
+	    tokadd(c);
+
+	    if ((c  extc()) '{') {
+		hex  can_hex(lex_p, 8, &numlen);
+		if (numlen < 2) {
+		    yyerror("Invalid Unicode escape");
+		    return -1;
+		}
+		tokadd('{');
+		while (numlen--)
+		    tokadd(nextc());
+		if ((c  extc()) '}')
+		    tokadd(c);
+		else {
+		    pushback(c);
+		    yyerror("Unterminated Unicode escape");
+		    return -1;
+		}
+	    }	    
+	    else {
+		pushback(c);
+		hex  can_hex(lex_p, 4, &numlen);
+		if (numlen < 4) {
+		    yyerror("Invalid Unicode escape");
+		    return -1;
+		}
+
+		while (numlen--)
+		    tokadd(nextc());
+	    }
+
+	    if (mb && (hex > x80)) *mb  NC_CODERANGE_UNKNOWN;
+	}
+	return 0;
+
       case 'M':
 	if ((c  extc()) ! -') {
 	    yyerror("Invalid escape character syntax");
@@ -5230,6 +5303,31 @@
 		    pushback(c);
 		    if (func & STR_FUNC_ESCAPE) tokadd('\\');
 		    c  ead_escape(mb);
+
+		    /*
+		     * The \u escape can return codepoints > 255.
+		     * Encode these using the current encoding.
+		     * If the encoding is a single-byte encoding, then
+		     * treat \u as a repeated \x and just encode the bytes
+		     */
+		    if (c > 255) {
+			int maxbytes  b_enc_mbmaxlen(parser->enc);
+			UChar buf[maxbytes];
+			if (maxbytes > 1) { 
+			    int i,n;
+			    n  b_enc_mbcput(c,buf,parser->enc);
+			    for(i i < n; i++) tokadd(buf[i]);
+			}
+			else {  
+			    /* in ASCII, \uWXYZ is just \xWX\xYZ */
+			    /* \u{12345678} is \x12\x34\x56\x78 */
+			    if (c & 0xff000000) tokadd(c >> 24);
+			    if (c & 0x00ff0000) tokadd((c >> 16) & 0xff);
+			    if (c & 0x0000ff00) tokadd((c >> 8) & 0xff);
+			    tokadd(c & 0xff);
+			}
+			continue;
+		    }
 		}
 		else if ((func & STR_FUNC_QWORDS) && ISSPACE(c)) {
 		    /* ignore backslashed spaces in %w */

--------------000303010108010007030107--