なかだです。

At Wed, 27 Mar 2002 13:21:42 +0900,
Yukihiro Matsumoto wrote:
> |\x{}では、コードポイントを有効なバイト表現にエンコードする、ということ
> |でしょうか。
> 
> そういうことです。

うむむ。めんどくさっ。


Index: parse.y
===================================================================
RCS file: /cvs/ruby/src/ruby/parse.y,v
retrieving revision 1.164
diff -u -2 -p -r1.164 parse.y
--- parse.y 2002/03/26 06:18:49 1.164
+++ parse.y 2002/03/28 13:34:52
@@ -2364,4 +2364,14 @@ tokadd(c)
 }
 
+static void
+tokspace(n)
+    int n;
+{
+    if ((tokidx += n) > toksiz) {
+        while ((toksiz *= 2) < tokidx);
+        REALLOC_N(tokenbuf, char, toksiz);
+    }
+}
+
 static int
 read_escape()
@@ -2494,5 +2504,16 @@ tokadd_escape(term)
             tokadd('\\');
             tokadd(c);
-            scan_hex(lex_p, 2, &numlen);
+            if ((c = nextc()) == '{') {
+                char *p = memchr(lex_p, '}', lex_pend - lex_p);
+                if (!p || (scan_hex(lex_p, p - lex_p, &numlen), numlen != p - lex_p)) {
+                    yyerror("Invalid hexadecimal character syntax");
+                    return '\0';
+                }
+                tokadd(c);
+                ++numlen;
+            }
+            else {
+                scan_hex(lex_p, 2, &numlen);
+            }
             while (numlen--)
                 tokadd(nextc());
@@ -2706,7 +2727,20 @@ parse_string(func, term, paren)
                 }
                 else {
-                    pushback(c);
                     if (func != '"') tokadd('\\');
-                    tokadd(read_escape());
+                    if (c == 'x' && peek('{')) {
+                        int numlen;
+                        char *p;
+                        nextc();
+                        p = memchr(lex_p, '}', lex_pend -lex_p);
+                        if (!p) goto unterm_str;
+                        c = scan_hex(lex_p, p - lex_p, &numlen);
+                        if (numlen != lex_pend - lex_p) goto unterm_str;
+                        tokspace(wclen(c));
+                        tokidx += wc2mbs(c, &tokenbuf[tokidx]);
+                    }
+                    else {
+                        pushback(c);
+                        tokadd(read_escape());
+                    }
                 }
                 continue;
Index: regex.c
===================================================================
RCS file: /cvs/ruby/src/ruby/regex.c,v
retrieving revision 1.67
diff -u -2 -p -r1.67 regex.c
--- regex.c 2002/03/25 09:08:15 1.67
+++ regex.c 2002/03/28 15:19:15
@@ -499,4 +499,61 @@ utf8_firstbyte(c)
 }
 
+int wclen(c)
+     unsigned long c;
+{
+  switch (current_mbctype) {
+  case MBCTYPE_ASCII:
+    return 1;
+  case MBCTYPE_EUC:
+    if (c < 0x100) return 1;
+    if (c < 0x10000) return 2;
+    return 3;
+  case MBCTYPE_SJIS:
+    if (c < 0x100) return 1;
+    return 2;
+  case MBCTYPE_UTF8:
+    if (c < 0x80) return 1;
+    if (c <= 0x7ff) return 2;
+    if (c <= 0xffff) return 3;
+    if (c <= 0x1fffff) return 4;
+    if (c <= 0x3ffffff) return 5;
+    if (c <= 0x7fffffff) return 6;
+  }
+  return -1;
+}
+
+int wc2mbs(c, s)
+     unsigned long c;
+     char *s;
+{
+  switch (current_mbctype) {
+  case MBCTYPE_ASCII:
+    *s++ = (char)c;
+    return 1;
+  case MBCTYPE_UTF8:
+    {
+      int l = mbclen(*s = utf8_firstbyte(c));
+      int n = l;
+      while (--l > 0) {
+        s[l] = (c & 0x3f) | 0xc0;
+      }
+      return n;
+    }
+    break;
+  default:
+    {
+      char *start = s;
+      if (c >> BYTEWIDTH * 2) {
+        *s++ = (char)(c >> BYTEWIDTH * 2);
+      }
+      if (c >> BYTEWIDTH) {
+        *s++ = (char)(c >> BYTEWIDTH);
+      }
+      *s++ = (char)c;
+      return s - start;
+    }
+  }
+}
+
 static void
 print_mbc(c)
@@ -1530,6 +1587,20 @@ re_compile_pattern(pattern, size, bufp)
 
               case 'x':
-                c = scan_hex(p, 2, &numlen);
-                p += numlen;
+                if (*p == '{') {
+                  ++p;
+                  nextp = memchr(p, '}', pend - p);
+                  if (!nextp) goto end_of_pattern;
+                  c = scan_hex(p, nextp - p, &numlen);
+                  if (numlen != nextp - p) goto invalid_escape;
+                  p = nextp + 1;
+                  if (wclen(c) > 1) {
+                    ++had_mbchar;
+                    break;
+                  }
+                }
+                else {
+                  c = scan_hex(p, 2, &numlen);
+                  p += numlen;
+                }
                 had_num_literal = 1;
                 break;
@@ -2259,6 +2330,20 @@ re_compile_pattern(pattern, size, bufp)
           case 'x':
             had_mbchar = 0;
-            c = scan_hex(p, 2, &numlen);
-            p += numlen;
+            if (*p == '{') {
+              ++p;
+              nextp = memchr(p, '}', pend - p);
+              if (!nextp) goto end_of_pattern;
+              c = scan_hex(p, nextp - p, &numlen);
+              if (numlen != nextp - p) goto invalid_escape;
+              p = ++nextp;
+              if (wclen(c) > 1) {
+                had_mbchar = 2;
+                goto wide_char;
+              }
+            }
+            else {
+              c = scan_hex(p, 2, &numlen);
+              p += numlen;
+            }
             had_num_literal = 1;
             goto numeric_char;
@@ -2341,4 +2426,5 @@ re_compile_pattern(pattern, size, bufp)
     numeric_char:
       nextp = p + mbclen(c) - 1;
+    wide_char:
       if (!pending_exact || pending_exact + *pending_exact + 1 != b
           || *pending_exact >= (c1 ? 0176 : 0177)
@@ -2355,4 +2441,12 @@ re_compile_pattern(pattern, size, bufp)
         (*pending_exact)++;
         had_num_literal = 0;
+      }
+      if (had_mbchar == 2) {
+        int len = wclen(c);
+        GET_BUFFER_SPACE(len);
+        wc2mbs(c, b);
+        b += len;
+        *pending_exact += len;
+        continue;
       }
       BUFPUSH(c);
Index: regex.h
===================================================================
RCS file: /cvs/ruby/src/ruby/regex.h,v
retrieving revision 1.15
diff -u -2 -p -r1.15 regex.h
--- regex.h 2002/01/04 14:14:35 1.15
+++ regex.h 2002/03/28 13:35:00
@@ -40,4 +40,6 @@
 # define re_set_casetable ruby_re_set_casetable
 # define register_info_type ruby_register_info_type
+# define wclen ruby_wclen
+# define wc2mbs ruby_wc2mbs
 #endif
 
@@ -101,6 +103,10 @@ const unsigned char *re_mbctab;
 #if defined(__STDC__)
 void re_mbcinit (int);
+int wclen (unsigned long);
+int wc2mbs (unsigned long, char *);
 #else
 void re_mbcinit ();
+int wclen ();
+int wc2mbs ();
 #endif
 
-- 
--- 僕の前にBugはない。
--- 僕の後ろにBugはできる。
    中田 伸悦