In article <87wsrz7wpg.fsf / fsij.org>,
  Tanaka Akira <akr / fsij.org> writes:

> 文字列がそのエンコーディングとして正しいかどうかを確認する機
> 能とか。

というのを実装するには Oniguruma レベルの mbclen あたりで間違っ
たエンコーディングをまともに扱って、その情報を提供させないと
いけないわけですが、やってみるとこんな感じですかね。

とりあえず、その情報を使って String#inspect が変なバイトをちゃ
んと判別してエスケープするようにしてみました。

% ./ruby -e 'p "\xa1x".force_encoding("euc-jp")'|cat -v
"M-!x"

というように文字になってない 8bit バイトが出てくるのが

% ./ruby -e 'p "\xa1x".force_encoding("euc-jp")'|cat -v
"\241x"

というようにエスケープされるようになります。

他にも

% ./ruby -e 'p "\374".force_encoding("utf-8")'
"\000"

というように、嘘つけといいたくなるようなのが

% ./ruby -e 'p "\374".force_encoding("utf-8")'
"\374"

と出るようになります。

Index: encoding.c
===================================================================
--- encoding.c	(revision 14084)
+++ encoding.c	(working copy)
@@ -495,6 +495,12 @@ rb_enc_mbclen(const char *p, const char 
 }
 
 int
+rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
+{
+    return ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
+}
+
+int
 rb_enc_codelen(int c, rb_encoding *enc)
 {
     int n = ONIGENC_CODE_TO_MBCLEN(enc,c);
Index: include/ruby/encoding.h
===================================================================
--- include/ruby/encoding.h	(revision 14084)
+++ include/ruby/encoding.h	(working copy)
@@ -71,6 +71,12 @@ rb_encoding * rb_enc_find(const char *na
 /* ptr,encoding -> mbclen */
 int rb_enc_mbclen(const char*, const char *, rb_encoding*);
 
+/* ptr,encoding -> mbclen, invalid or needmore */
+int rb_enc_precise_mbclen(const char*, const char *, rb_encoding*);
+#define MBCLEN_CHARFOUND(ret)     ONIGENC_MBCLEN_CHARFOUND(ret)
+#define MBCLEN_INVALID(ret)       ONIGENC_MBCLEN_INVALID(ret)
+#define MBCLEN_NEEDMORE(ret)      ONIGENC_MBCLEN_NEEDMORE(ret)
+
 /* code,encoding -> codelen */
 int rb_enc_codelen(int, rb_encoding*);
 
Index: include/ruby/oniguruma.h
===================================================================
--- include/ruby/oniguruma.h	(revision 14084)
+++ include/ruby/oniguruma.h	(working copy)
@@ -144,7 +144,7 @@ typedef struct {
 typedef int (*OnigApplyAllCaseFoldFunc)(OnigCodePoint from, OnigCodePoint* to, int to_len, void* arg);
 
 typedef struct OnigEncodingTypeST {
-  int    (*mbc_enc_len)(const OnigUChar* p,const OnigUChar* e, struct OnigEncodingTypeST* enc);
+  int    (*precise_mbc_enc_len)(const OnigUChar* p,const OnigUChar* e, struct OnigEncodingTypeST* enc);
   const char*   name;
   int           max_enc_len;
   int           min_enc_len;
@@ -282,7 +282,32 @@ ONIG_EXTERN OnigEncodingType OnigEncodin
 #define ONIGENC_STEP_BACK(enc,start,s,n) \
         onigenc_step_back((enc),(start),(s),(n))
 
-#define ONIGENC_MBC_ENC_LEN(enc,p,e)           (enc)->mbc_enc_len(p,e,enc)
+
+#define ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(n)   (n)
+#define ONIGENC_CONSTRUCT_MBCLEN_INVALID()      (-1)
+#define ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n)    (-1-(n)) /* n = bytes still missing; parenthesized because callers pass expressions such as len-1 */
+
+static inline int onigenc_mbclen_charfound(int r) { return 0 < r ? r : 0; }
+static inline int onigenc_mbclen_needmore(int r) { return r < -1 ? -1 - r : 0; }
+#define ONIGENC_MBCLEN_CHARFOUND(r)     onigenc_mbclen_charfound(r)
+#define ONIGENC_MBCLEN_INVALID(r)       ((r) == -1)
+#define ONIGENC_MBCLEN_NEEDMORE(r)      onigenc_mbclen_needmore(r)
+
+#define ONIGENC_PRECISE_MBC_ENC_LEN(enc,p,e)   (enc)->precise_mbc_enc_len(p,e,enc)
+
+static inline int onigenc_mbclen_recover(const OnigUChar* p,const OnigUChar* e, struct OnigEncodingTypeST* enc)
+{
+    int ret = ONIGENC_PRECISE_MBC_ENC_LEN(enc,p,e);
+    int r;
+    if (ONIGENC_MBCLEN_INVALID(ret))
+        return 1;
+    else if ((r = ONIGENC_MBCLEN_NEEDMORE(ret)))
+        return e-p+r;
+    else
+        return ONIGENC_MBCLEN_CHARFOUND(ret);
+}
+
+#define ONIGENC_MBC_ENC_LEN(enc,p,e)           onigenc_mbclen_recover(p,e,enc)
 #define ONIGENC_MBC_MAXLEN(enc)               ((enc)->max_enc_len)
 #define ONIGENC_MBC_MAXLEN_DIST(enc)           ONIGENC_MBC_MAXLEN(enc)
 #define ONIGENC_MBC_MINLEN(enc)               ((enc)->min_enc_len)
Index: enc/euc_jp.c
===================================================================
--- enc/euc_jp.c	(revision 14084)
+++ enc/euc_jp.c	(working copy)
@@ -50,10 +50,85 @@ static const int EncLen_EUCJP[] = {
   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
 };
 
+typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1, S2 } state_t;
+#define A ACCEPT
+#define F FAILURE
+static const signed char trans[][0x100] = {
+  { /* S0   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
+    /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, 1, 2,
+    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* a */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F 
+  },
+  { /* S1   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
+    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* a */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* e */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* f */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F 
+  },
+  { /* S2   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
+    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* a */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F 
+  },
+
+};
+#undef A
+#undef F
+
 static int
 mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc)
 {
-  return EncLen_EUCJP[*p];
+  int firstbyte = *p++;
+  state_t s;
+  s = trans[0][firstbyte];
+  if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
+                                  ONIGENC_CONSTRUCT_MBCLEN_INVALID();
+  if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCJP[firstbyte]-1);
+  s = trans[s][*p++];
+  if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
+                                  ONIGENC_CONSTRUCT_MBCLEN_INVALID();
+  if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCJP[firstbyte]-2);
+  s = trans[s][*p++];
+  return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) :
+                       ONIGENC_CONSTRUCT_MBCLEN_INVALID();
 }
 
 static OnigCodePoint
Index: enc/utf8.c
===================================================================
--- enc/utf8.c	(revision 14084)
+++ enc/utf8.c	(working copy)
@@ -59,10 +59,155 @@ static const int EncLen_UTF8[] = {
   4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
 };
 
+typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1, S2, S3, S4, S5 } state_t;
+#define A ACCEPT
+#define F FAILURE
+static const signed char trans[][0x100] = {
+  { /* S0   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
+    /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* e */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    /* f */ 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, F, F 
+  },
+  { /* S1   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
+    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 8 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* 9 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* a */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F 
+  },
+  { /* S2   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
+    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F 
+  },
+  { /* S3   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
+    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 8 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    /* 9 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    /* a */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    /* b */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F 
+  },
+  { /* S4   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
+    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 8 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    /* 9 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    /* a */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    /* b */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F 
+  },
+  { /* S5   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
+    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 8 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+    /* 9 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+    /* a */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+    /* b */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F 
+  }
+};
+#undef A
+#undef F
+
 static int
 utf8_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc)
 {
-  return EncLen_UTF8[*p];
+  int firstbyte = *p++;
+  state_t s;
+  s = trans[0][firstbyte];
+  if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
+                                  ONIGENC_CONSTRUCT_MBCLEN_INVALID();
+
+  if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-1);
+  s = trans[s][*p++];
+  if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
+                                  ONIGENC_CONSTRUCT_MBCLEN_INVALID();
+
+  if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-2);
+  s = trans[s][*p++];
+  if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) :
+                                  ONIGENC_CONSTRUCT_MBCLEN_INVALID();
+
+  if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-3);
+  s = trans[s][*p++];
+  if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4) :
+                                  ONIGENC_CONSTRUCT_MBCLEN_INVALID();
+
+  if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-4);
+  s = trans[s][*p++];
+  if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(5) :
+                                  ONIGENC_CONSTRUCT_MBCLEN_INVALID();
+
+  if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-5);
+  s = trans[s][*p++];
+  return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(6) :
+                       ONIGENC_CONSTRUCT_MBCLEN_INVALID();
 }
 
 static int
Index: enc/sjis.c
===================================================================
--- enc/sjis.c	(revision 14084)
+++ enc/sjis.c	(working copy)
@@ -70,10 +70,62 @@ static const char SJIS_CAN_BE_TRAIL_TABL
 #define SJIS_ISMB_FIRST(byte)  (EncLen_SJIS[byte] > 1)
 #define SJIS_ISMB_TRAIL(byte)  SJIS_CAN_BE_TRAIL_TABLE[(byte)]
 
+typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1 } state_t;
+#define A ACCEPT
+#define F FAILURE
+static const signed char trans[][0x100] = {
+  { /* S0   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
+    /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* 8 */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* a */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F, F, F
+  },
+  { /* S1   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
+    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+    /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F,
+    /* 8 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* 9 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* a */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* e */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
+    /* f */ A, A, A, A, A, A, A, A, A, A, A, A, A, F, F, F
+  }
+};
+#undef A
+#undef F
+
 static int
 mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc)
 {
-  return EncLen_SJIS[*p];
+  int firstbyte = *p++;
+  state_t s;
+  s = trans[0][firstbyte];
+  if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
+                                  ONIGENC_CONSTRUCT_MBCLEN_INVALID();
+  if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_SJIS[firstbyte]-1);
+  s = trans[s][*p++];
+  return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
+                       ONIGENC_CONSTRUCT_MBCLEN_INVALID();
 }
 
 static int
Index: string.c
===================================================================
--- string.c	(revision 14084)
+++ string.c	(working copy)
@@ -2919,10 +2919,19 @@ rb_str_inspect(VALUE str)
     str_cat_char(result, '"', enc);
     p = RSTRING_PTR(str); pend = RSTRING_END(str);
     while (p < pend) {
-	int c = rb_enc_codepoint(p, pend, enc);
-	int n = rb_enc_codelen(c, enc);
+	int c;
+	int n;
 	int cc;
 
+        n = rb_enc_precise_mbclen(p, pend, enc);
+        if (!MBCLEN_CHARFOUND(n)) {
+            c = (unsigned char)*p++;
+            goto escape_byte;
+        }
+
+	c = rb_enc_codepoint(p, pend, enc);
+	n = rb_enc_codelen(c, enc);
+
 	p += n;
 	if (c == '"'|| c == '\\' ||
 	    (c == '#' && (cc = rb_enc_codepoint(p,pend,enc),
@@ -2961,8 +2970,10 @@ rb_str_inspect(VALUE str)
 	}
 	else {
 	    char buf[5];
-	    char *s = buf;
+	    char *s;
 
+escape_byte:
+	    s = buf;
 	    sprintf(buf, "\\%03o", c & 0377);
 	    while (*s) {
 		str_cat_char(result, *s++, enc);
Index: test/ruby/test_m17n.rb
===================================================================
--- test/ruby/test_m17n.rb	(revision 14084)
+++ test/ruby/test_m17n.rb	(working copy)
@@ -36,6 +36,38 @@ class TestM17N < Test::Unit::TestCase
     assert_nothing_raised { eval(u(%{"\\u{6666}\xc0\xa0"})) }
   end
 
+  def test_string_inspect
+    assert_equal('"\376"', e("\xfe").inspect)
+    assert_equal('"\216"', e("\x8e").inspect)
+    assert_equal('"\217"', e("\x8f").inspect)
+    assert_equal('"\217\241"', e("\x8f\xa1").inspect)
+    assert_equal('"\357"', s("\xef").inspect)
+    assert_equal('"\300"', u("\xc0").inspect)
+    assert_equal('"\340\200"', u("\xe0\x80").inspect)
+    assert_equal('"\360\200\200"', u("\xf0\x80\x80").inspect)
+    assert_equal('"\370\200\200\200"', u("\xf8\x80\x80\x80").inspect)
+    assert_equal('"\374\200\200\200\200"', u("\xfc\x80\x80\x80\x80").inspect)
+
+    assert_equal('"\376 "', e("\xfe ").inspect)
+    assert_equal('"\216 "', e("\x8e ").inspect)
+    assert_equal('"\217 "', e("\x8f ").inspect)
+    assert_equal('"\217\241 "', e("\x8f\xa1 ").inspect)
+    assert_equal('"\357 "', s("\xef ").inspect)
+    assert_equal('"\300 "', u("\xc0 ").inspect)
+    assert_equal('"\340\200 "', u("\xe0\x80 ").inspect)
+    assert_equal('"\360\200\200 "', u("\xf0\x80\x80 ").inspect)
+    assert_equal('"\370\200\200\200 "', u("\xf8\x80\x80\x80 ").inspect)
+    assert_equal('"\374\200\200\200\200 "', u("\xfc\x80\x80\x80\x80 ").inspect)
+
+
+    assert_equal(e("\"\\241\x8f\xa1\xa1\""), e("\xa1\x8f\xa1\xa1").inspect)
+
+    assert_equal('"\201."', s("\x81.").inspect)
+    assert_equal(s("\"\x81@\""), s("\x81@").inspect)
+
+    assert_equal('"\374"', u("\xfc").inspect)
+  end
+
   def test_regexp_too_short_multibyte_character
     assert_raise(SyntaxError) { eval('/\xfe/e') }
     assert_raise(SyntaxError) { eval('/\x8e/e') }
-- 
[田中 哲][たなか あきら][Tanaka Akira]