Issue #2567 has been updated by Yui NARUSE.
I don't decide whether merge this or not yet, an experimental patch is following:
diff --git a/lib/net/http.rb b/lib/net/http.rb
index 1c594e0..0abcaa5 100644
--- a/lib/net/http.rb
+++ b/lib/net/http.rb
@@ -2723,6 +2723,8 @@ module Net #:nodoc:
end
@read = true
+ enc = detect_encoding(@body)
+ @body.force_encoding(enc) if enc
@body
end
@@ -2807,6 +2809,167 @@ module Net #:nodoc:
end
end
+ private
+ # :nodoc:
+ def detect_encoding(str, encoding=nil)
+ if encoding
+ elsif encoding = type_params['charset']
+ elsif encoding = check_bom(str)
+ else
+ case main_type.downcase
+ when %r{text/x(?:ht)?ml|application/(?:[^+]+\+)?xml}
+ /\A<xml[ \t\r\n]+
+ version[ \t\r\n]*=[ \t\r\n]*(?:"[0-9.]+"|'[0-9.]*')[ \t\r\n]+
+ encoding[ \t\r\n]*=[ \t\r\n]*
+ (?:"([A-Za-z][\-A-Za-z0-9._]*)"|'([A-Za-z][\-A-Za-z0-9._]*)')/x =~ str
+ encoding = $1 || $2 || Encoding::UTF_8
+ when %r{text/html.*}
+ sniff_encoding(str, encoding=nil)
+ end
+ end
+ return encoding
+ end
+
+ # :nodoc:
+ def sniff_encoding(str, encoding=nil)
+ # the encoding sniffing algorithm
+ # http://www.w3.org/TR/html5/parsing.html#determining-the-character-encoding
+ return enc if enc = scanning_meta(str)
+ # 6. last visited page or something
+ # 7. frequency
+ if str.ascii_only?
+ return Encoding::US_ASCII
+ else
+ utf8str = str.dup.force_encoding(Encoding::UTF_8)
+ return utf8str if utf8str.valid_encoding?
+ end
+ # 8. implementation-defined or user-specified
+ end
+
+ # :nodoc:
+ def check_bom(str)
+ case str.byteslice(0, 2)
+ when "\xFE\xFF"
+ return Encoding::UTF_16BE
+ when "\xFF\xFE"
+ return Encoding::UTF_16LE
+ end
+ if "\xEF\xBB\xBF" == str.byteslice(0, 3)
+ return Encoding::UTF_8
+ end
+ nil
+ end
+
+ # :nodoc:
+ def scanning_meta(str)
+ require 'strscan'
+ ss = StringScanner.new(str)
+ while true
+ if ss.skip(/<!--.*?-->/)
+ elsif ss.skip(/meta[\t\n\f\r ]*/)
+ attrs = {} # attribute_list
+ got_pragma = false
+ need_pragma = nil
+ charset = nil
+
+ # step: Attributes
+ while attr = get_attribute(ss)
+ name, value = *attr
+ next if attrs[name]
+ attrs[name] = true
+ case name
+ when 'http-equev'
+ got_pragma = true if value == 'content-type'
+ when 'content'
+ encoding = extracting_encodings_from_meta_elements(value)
+ unless charset
+ charset = encoding
+ end
+ need_pragma = true
+ when 'charset'
+ need_pragma = false
+ charset = value
+ end
+ end
+
+ # step: Processing
+ next if need_pragma.nil?
+ next if need_pragma && !got_pragma
+ charset = Encoding.find(charset) rescue nil
+ next unless charset
+ charset = Encoding::UTF_8 if charset == Encoding::UTF_16
+ return charset # tentative
+ elsif ss.skip(/<\/?[A-Za-z][^\t\n\f\r ]*/)
+ 1 while get_attribute(ss)
+ elsif ss.skip(/<[!\/?][^>]*>/)
+ elsif ss.getch
+ end
+ end
+ nil
+ end
+
+ def get_attribute(ss)
+ ss.scan(/[\t\n\f\r \/]*/)
+ if ss.peek(1) == '>'
+ ss.getch
+ return nil
+ end
+ name = ss.scan(/[^=\t\n\f\r \/>]*/)
+ name.downcase!
+ raise if name.empty?
+ ss.skip(/[\t\n\f\r ]*/)
+ if ss.getch != '='
+ value = ''
+ return [name, value]
+ end
+ ss.skip(/[\t\n\f\r ]*/)
+ case ss.peek(1)
+ when '"'
+ ss.getch
+ value = ss.scan(/[^"]+/)
+ value.downcase!
+ ss.getch
+ when "'"
+ ss.getch
+ value = ss.scan(/[^']+/)
+ value.downcase!
+ ss.getch
+ when '>'
+ value = ''
+ else
+ value = ss.scan(/[^\t\n\f\r >]+/)
+ value.downcase!
+ end
+ [name, value]
+ end
+
+ def extracting_encodings_from_meta_elements(value)
+ # http://dev.w3.org/html5/spec/fetching-resources.html#algorithm-for-extracting-an-encoding-from-a-meta-element
+ if /charset[\t\n\f\r ]*=(?:"([^"]*)"|'([^']*)'|["']|\z|([^\t\n\f\r ;]+))/i =~ value
+ return $1 || $2 || $3
+ end
+ return nil
+ end
+
+ # http://dev.w3.org/html5/spec/parsing.html#table-encoding-overrides
+ TABLE_ENCODING_OVERRIDES = {
+ 'EUC-KR' => Encoding::CP949,
+ 'EUC-JP' => Encoding::CP51932,
+ 'GB2312' => Encoding::GBK,
+ 'GB_2312-80' => Encoding::GBK,
+ 'ISO-8859-1' => Encoding::Windows_1252,
+ 'ISO-8859-9' => Encoding::Windows_1254,
+ 'ISO-8859-11' => Encoding::Windows_874,
+ 'KS_C_5601-1987' => Encoding::CP949,
+ 'SHIFT_JIS' => Encoding::Windows_31J,
+ 'TIS-620' => Encoding::Windows_874,
+ 'US-ASCII' => Encoding::Windows_1252,
+ }
+
+ # :nodoc:
+ def override_encoding(enc)
+ TABLE_ENCODING_OVERRIDES[enc.strip.upcase] || enc
+ end
end
----------------------------------------
Feature #2567: Net::HTTP does not handle encoding correctly
http://redmine.ruby-lang.org/issues/2567
Author: Ryan Sims
Status: Assigned
Priority: Low
Assignee: Yui NARUSE
Category: lib
Target version: 2.0.0
ruby -v: ruby 1.9.1p376 (2009-12-07 revision 26041) [i686-linux]
=begin
A string returned by an HTTP get does not have its encoding set appropriately with the charset field, nor does the content_type report the charset. Example code demonstrating incorrect behavior is below.
#!/usr/bin/ruby -w
# encoding: UTF-8
require 'net/http'
uri = URI.parse('http://www.hearya.com/feed/')
result = Net::HTTP.start(uri.host, uri.port) {|http|
http.get(uri.request_uri)
}
p result['content-type'] # "text/xml; charset=UTF-8" <- correct
p result.content_type # "text/xml" <- incorrect; truncates the charset field
puts result.body.encoding # ASCII-8BIT <- incorrect encoding, should be UTF-8
=end
--
http://redmine.ruby-lang.org