Dominik Bathon <dbatml / gmx.de> wrote:

> Hope that helps,

Fine, thanks a lot, it works. You explained very well why the Ruby version
works on a string like: string="&... but NOT on files, because of
the \n. Here is a script that compares the Perl output with the Ruby one:
# Reports whether the whole content of the file +fileName+ is well-formed
# UTF-8, checked byte by byte against the pattern below (ASCII control
# characters other than TAB, LF and CR are rejected as well).
#
# fileName:: path of the file to check.
# Returns true when every byte sequence matches, false otherwise; an empty
# file counts as valid. Raises the usual File.open error (e.g.
# Errno::ENOENT) when the file cannot be opened.
def isFileUtf8Encoded(fileName)
  # The /n flag makes the pattern a pure byte matcher: it is then
  # independent of $KCODE on Ruby 1.8 and accepted by Ruby 1.9+, which
  # rejects \xNN byte ranges in a non-binary regexp.
  # The group is non-capturing because only the overall match matters.
  utf8rgx = /\A(?:
      [\x09\x0A\x0D\x20-\x7E]            # ASCII
    | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
    |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
    | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
    |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
    |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
    | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
    |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
  )*\z/nx
  # Binary mode: no newline translation and no transcoding, so the pattern
  # sees exactly the bytes stored on disk. The block form closes the file
  # even if reading raises.
  str = File.open(fileName, "rb") { |f| f.read }
  utf8rgx === str
end

# Files checked by both implementations, with the results observed:
#
#   file                   Ruby     Perl
#   lutte-ouvriere.html    false    "0"
#   l_harmatan.html        false    "0"
#   tut_exceptions.html    false    "0"
#   butf.rb                true     "1"
#   biso.rb                false    "0"
sample_files = %w[
  lutte-ouvriere.html
  l_harmatan.html
  tut_exceptions.html
  butf.rb
  biso.rb
]

# Ruby implementation, one result per file.
sample_files.each { |name| p isFileUtf8Encoded(name) }

# Perl implementation, run through the shell on the same files.
sample_files.each { |name| p `perl IsUTF-8.pl "#{name}"` }

p $KCODE # => "UTF8"

The Perl script (called from the Ruby one) is:

#!/usr/bin/perl

# isFileUtf8Encoded($fn)
#
# Reads the file named $fn as raw bytes and checks the whole content
# against the byte-level UTF-8 pattern below (ASCII control characters
# other than TAB, LF and CR are rejected as well).
#
# Returns 1 when the content matches, 0 otherwise; an empty file counts
# as valid. Dies when no file name is given or the file cannot be opened.
sub isFileUtf8Encoded
{
        my ($fn) = @_;

        # A three-arg open on an undefined name may open an anonymous
        # temporary file instead of failing, so reject it explicitly.
        die "Unable to open file : no file name given"
            unless defined $fn && length $fn;

        # Three-arg open keeps characters such as '>' or '|' in the name
        # from being read as an open mode; :raw hands over the bytes
        # exactly as stored, with no newline or encoding layer.
        open(my $fh, '<:raw', $fn)
            or die "Unable to open file $fn : $!";
        my $bytes = do { local $/; <$fh> };   # slurp the whole file
        close $fh;
        $bytes = '' unless defined $bytes;

        # \A and \z anchor the match to the entire content; the group is
        # non-capturing because only success or failure is needed.
        my $is_utf8 = $bytes =~
          m/\A(?:
             [\x09\x0A\x0D\x20-\x7E]            # ASCII
           | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
           |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
           | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
           |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
           |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
           | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
           |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
          )*\z/x;

        return $is_utf8 ? 1 : 0;
}
# Print 1 or 0 for the file named by the first command-line argument.
# $ARGV[0] is the scalar element; @ARGV[0] is a one-element slice.
print isFileUtf8Encoded($ARGV[0]);


-- 
une bue