Dominik Bathon <dbatml / gmx.de> wrote:
> Hope that helps,
Fine, thanks a lot, it works. You explained very well why the Ruby version
works on a string like string="&éçàôûîêäë¢" BUT NOT on files, because of
the \n... Here is a script that compares the Perl output with the Ruby one:
# Tells whether the file +fileName+ contains nothing but well-formed UTF-8
# text: tab, LF, CR, printable ASCII, and non-overlong multi-byte sequences
# (surrogates and code points above plane 16 are rejected).
#
# fileName:: path of the file to inspect.
# Returns true or false. Raises the usual Errno::* error when the file
# cannot be opened.
def isFileUtf8Encoded(fileName)
  # The /n flag makes the pattern byte-oriented so the \xNN escapes denote
  # single bytes regardless of $KCODE or the source encoding; the group is
  # non-capturing because only the overall match result is used.
  utf8rgx = /\A(?:
      [\x09\x0A\x0D\x20-\x7E]            # ASCII
    | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
    | \xE0[\xA0-\xBF][\x80-\xBF]         # excluding overlongs
    | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
    | \xED[\x80-\x9F][\x80-\xBF]         # excluding surrogates
    | \xF0[\x90-\xBF][\x80-\xBF]{2}      # planes 1-3
    | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
    | \xF4[\x80-\x8F][\x80-\xBF]{2}      # plane 16
  )*\z/nx
  # Read raw bytes ("rb") and let the block form close the handle; the
  # previous File.open(...).each left the file open.
  str = File.open(fileName, "rb") { |f| f.read }
  return (utf8rgx === str)
end
# Print the Ruby verdict for every sample file, then the verdict of the Perl
# script for the same files, then the current $KCODE.
# Output recorded by the author, per file (Ruby / Perl):
#   lutte-ouvriere.html  => false / "0"
#   l_harmatan.html      => false / "0"
#   tut_exceptions.html  => false / "0"
#   butf.rb              => true  / "1"
#   biso.rb              => false / "0"
samples = [
  "lutte-ouvriere.html",
  "l_harmatan.html",
  "tut_exceptions.html",
  "butf.rb",
  "biso.rb"
]
samples.each { |name| p isFileUtf8Encoded(name) }
samples.each { |name| p `perl IsUTF-8.pl "#{name}"` }
p $KCODE # => "UTF8"
The Perl script (called from the Ruby one) is:
#!/usr/bin/perl
# isFileUtf8Encoded($fn)
#
# Reads the file named $fn as raw bytes and checks that its whole content is
# well-formed UTF-8 text: tab, LF, CR, printable ASCII, and non-overlong
# multi-byte sequences (surrogates and code points above plane 16 are
# rejected).
#
# Returns 1 when the content is valid (an empty file counts as valid),
# 0 otherwise. Dies when the file cannot be opened.
sub isFileUtf8Encoded
{
    my ($fn) = @_;

    # Three-argument open with a lexical handle: the file name can no longer
    # be mistaken for a mode or a pipe, and :raw guarantees we see bytes.
    open(my $fh, '<:raw', $fn) or die "Unable to open file $fn : $!";
    my $string = do { local $/; <$fh> };
    close $fh;
    $string = '' unless defined $string;

    my $utf8_char = qr/
        [\x09\x0A\x0D\x20-\x7E]            # ASCII
      | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
      | \xE0[\xA0-\xBF][\x80-\xBF]         # excluding overlongs
      | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
      | \xED[\x80-\x9F][\x80-\xBF]         # excluding surrogates
      | \xF0[\x90-\xBF][\x80-\xBF]{2}      # planes 1-3
      | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
      | \xF4[\x80-\x8F][\x80-\xBF]{2}      # plane 16
    /x;

    # Consume valid sequences in bounded chunks, resuming where the previous
    # chunk stopped. A single unbounded (...)* over the whole file can reach
    # perl's iteration limit for complex subexpressions and then fail on
    # large but perfectly valid input.
    pos($string) = 0;
    1 while $string =~ /\G(?:$utf8_char){1,4096}/gc;

    # With the c modifier pos() survives the final failed match, so it marks
    # the first byte that could not be consumed.
    return pos($string) == length($string) ? 1 : 0;
}
# Print 1 or 0 for the file named by the first command-line argument.
# $ARGV[0] is the scalar element; @ARGV[0] was a one-element array slice.
print isFileUtf8Encoded($ARGV[0]);
--
une bñ×ue