Issue #2567 has been updated by Yui NARUSE.


I haven't decided yet whether to merge this; an experimental patch follows:

diff --git a/lib/net/http.rb b/lib/net/http.rb
index 1c594e0..0abcaa5 100644
--- a/lib/net/http.rb
+++ b/lib/net/http.rb
@@ -2723,6 +2723,8 @@ module Net   #:nodoc:
       end
       @read = true
 
+      enc = detect_encoding(@body)
+      @body.force_encoding(enc) if enc
       @body
     end
 
@@ -2807,6 +2809,167 @@ module Net   #:nodoc:
       end
     end
 
+    private
+    # :nodoc:
+    def detect_encoding(str, encoding=nil)
+      if encoding
+      elsif encoding = type_params['charset']
+      elsif encoding = check_bom(str)
+      else
+        case main_type.downcase
+        when %r{text/x(?:ht)?ml|application/(?:[^+]+\+)?xml}
+          /\A<xml[ \t\r\n]+
+            version[ \t\r\n]*=[ \t\r\n]*(?:"[0-9.]+"|'[0-9.]*')[ \t\r\n]+
+            encoding[ \t\r\n]*=[ \t\r\n]*
+            (?:"([A-Za-z][\-A-Za-z0-9._]*)"|'([A-Za-z][\-A-Za-z0-9._]*)')/x =~ str
+          encoding = $1 || $2 || Encoding::UTF_8
+        when %r{text/html.*}
+          sniff_encoding(str, encoding=nil)
+        end
+      end
+      return encoding
+    end
+
+    # :nodoc:
+    def sniff_encoding(str, encoding=nil)
+      # the encoding sniffing algorithm
+      # http://www.w3.org/TR/html5/parsing.html#determining-the-character-encoding
+      return enc if enc = scanning_meta(str)
+      # 6. last visited page or something
+      # 7. frequency
+      if str.ascii_only?
+        return Encoding::US_ASCII
+      else
+        utf8str = str.dup.force_encoding(Encoding::UTF_8)
+        return utf8str if utf8str.valid_encoding?
+      end
+      # 8. implementation-defined or user-specified
+    end
+
+    # :nodoc:
+    def check_bom(str)
+      case str.byteslice(0, 2)
+      when "\xFE\xFF"
+        return Encoding::UTF_16BE
+      when "\xFF\xFE"
+        return Encoding::UTF_16LE
+      end
+      if "\xEF\xBB\xBF" == str.byteslice(0, 3)
+        return Encoding::UTF_8
+      end
+      nil
+    end
+
+    # :nodoc:
+    def scanning_meta(str)
+      require 'strscan'
+      ss = StringScanner.new(str)
+      while true
+        if ss.skip(/<!--.*?-->/)
+        elsif ss.skip(/meta[\t\n\f\r ]*/)
+          attrs = {} # attribute_list
+          got_pragma = false
+          need_pragma = nil
+          charset = nil
+
+          # step: Attributes
+          while attr = get_attribute(ss)
+            name, value = *attr
+            next if attrs[name]
+            attrs[name] = true
+            case name
+            when 'http-equev'
+              got_pragma = true if value == 'content-type'
+            when 'content'
+              encoding = extracting_encodings_from_meta_elements(value)
+              unless charset
+                charset = encoding
+              end
+              need_pragma = true
+            when 'charset'
+              need_pragma = false
+              charset = value
+            end
+          end
+
+          # step: Processing
+          next if need_pragma.nil?
+          next if need_pragma && !got_pragma
+          charset = Encoding.find(charset) rescue nil
+          next unless charset
+          charset = Encoding::UTF_8 if charset == Encoding::UTF_16
+          return charset # tentative
+        elsif ss.skip(/<\/?[A-Za-z][^\t\n\f\r ]*/)
+          1 while get_attribute(ss)
+        elsif ss.skip(/<[!\/?][^>]*>/)
+        elsif ss.getch
+        end
+      end
+      nil
+    end
+
+    def get_attribute(ss)
+      ss.scan(/[\t\n\f\r \/]*/)
+      if ss.peek(1) == '>'
+        ss.getch
+        return nil
+      end
+      name = ss.scan(/[^=\t\n\f\r \/>]*/)
+      name.downcase!
+      raise if name.empty?
+      ss.skip(/[\t\n\f\r ]*/)
+      if ss.getch != '='
+        value = ''
+        return [name, value]
+      end
+      ss.skip(/[\t\n\f\r ]*/)
+      case ss.peek(1)
+      when '"'
+        ss.getch
+        value = ss.scan(/[^"]+/)
+        value.downcase!
+        ss.getch
+      when "'"
+        ss.getch
+        value = ss.scan(/[^']+/)
+        value.downcase!
+        ss.getch
+      when '>'
+        value = ''
+      else
+        value = ss.scan(/[^\t\n\f\r >]+/)
+        value.downcase!
+      end
+      [name, value]
+    end
+
+    def extracting_encodings_from_meta_elements(value)
+      # http://dev.w3.org/html5/spec/fetching-resources.html#algorithm-for-extracting-an-encoding-from-a-meta-element
+      if /charset[\t\n\f\r ]*=(?:"([^"]*)"|'([^']*)'|["']|\z|([^\t\n\f\r ;]+))/i =~ value
+        return $1 || $2 || $3
+      end
+      return nil
+    end
+
+    # http://dev.w3.org/html5/spec/parsing.html#table-encoding-overrides
+    TABLE_ENCODING_OVERRIDES = {
+      'EUC-KR'         => Encoding::CP949,
+      'EUC-JP'         => Encoding::CP51932,
+      'GB2312'         => Encoding::GBK,
+      'GB_2312-80'     => Encoding::GBK,
+      'ISO-8859-1'     => Encoding::Windows_1252,
+      'ISO-8859-9'     => Encoding::Windows_1254,
+      'ISO-8859-11'    => Encoding::Windows_874,
+      'KS_C_5601-1987' => Encoding::CP949,
+      'SHIFT_JIS'      => Encoding::Windows_31J,
+      'TIS-620'        => Encoding::Windows_874,
+      'US-ASCII'       => Encoding::Windows_1252,
+    }
+
+    # :nodoc:
+    def override_encoding(enc)
+      TABLE_ENCODING_OVERRIDES[enc.strip.upcase] || enc
+    end
   end
 
 
----------------------------------------
Feature #2567: Net::HTTP does not handle encoding correctly
http://redmine.ruby-lang.org/issues/2567

Author: Ryan Sims
Status: Assigned
Priority: Low
Assignee: Yui NARUSE
Category: lib
Target version: 2.0.0
ruby -v: ruby 1.9.1p376 (2009-12-07 revision 26041) [i686-linux]


=begin
 A string returned by an HTTP get does not have its encoding set appropriately with the charset field, nor does the content_type report the charset. Example code demonstrating incorrect behavior is below.
 
 #!/usr/bin/ruby -w
 # encoding: UTF-8
 
 require 'net/http'
 
 uri = URI.parse('http://www.hearya.com/feed/')
 result = Net::HTTP.start(uri.host, uri.port) {|http|
     http.get(uri.request_uri)
 }
 
 p result['content-type']     # "text/xml; charset=UTF-8" <- correct
 p result.content_type        # "text/xml" <- incorrect; truncates the charset field
 puts result.body.encoding    # ASCII-8BIT <- incorrect encoding, should be UTF-8
=end



-- 
http://redmine.ruby-lang.org