On Jan 29, 2006, at 10:23 AM, James Edward Gray II wrote:

> This is my port of File::ReadBackwards.

I haven't had time to document it yet, but here is the other port of  
WWW::RobotRules.  You use it something like this:

#!/usr/local/bin/ruby -w

require "robot_rules"
require "open-uri"

rules      = RobotRules.new("RubyQuizBrowser 1.0")
robots_url = "http://pragmaticprogrammer.com/robots.txt"

open(robots_url) do |url|
   data = url.read

   puts "/robots.txt:"
   puts data
   puts

   rules.parse(robots_url, data)
end

puts "URL tests:"
%w{ http://pragmaticprogrammer.com/images/dave.jpg
     http://pragmaticprogrammer.com/imagination }.each do |test|
   puts "rules.allowed?( #{test.inspect} )"
   puts rules.allowed?(test)
end

__END__

Which prints:

/robots.txt:
User-agent:  *
Disallow:    images

URL tests:
rules.allowed?( "http://pragmaticprogrammer.com/images/dave.jpg" )
false
rules.allowed?( "http://pragmaticprogrammer.com/imagination" )
true

James Edward Gray II

#!/usr/local/bin/ruby -w

# robot_rules.rb
#
#  Created by James Edward Gray II on 2006-01-31.
#  Copyright 2006 Gray Productions. All rights reserved.

require "uri"

# Based on Perl's WWW::RobotRules module, by Gisle Aas.
class RobotRules
   def initialize( user_agent )
     @user_agent = user_agent.scan(/\S+/).first.sub(%r{/.*},  
"").downcase
     @rules      = Hash.new { |rules, rule| rules[rule] = Array.new }
   end

   def parse( text_uri, robots_data )
     uri      = URI.parse(text_uri)
     location = "#{uri.host}:#{uri.port}"
     @rules.delete(location)

     rules      = robots_data.split(/[\015\012]+/).
                              map { |rule| rule.sub(/\s*#.*$/, "") }
     anon_rules = Array.new
     my_rules   = Array.new
     current    = anon_rules
     rules.each do |rule|
       case rule
       when /^\s*User-Agent\s*:\s*(.+?)\s*$/i
         break unless my_rules.empty?

         current = if $1 == "*"
           anon_rules
         elsif $1.downcase.index(@user_agent)
           my_rules
         else
           nil
         end
       when /^\s*Disallow\s*:\s*(.*?)\s*$/i
         next if current.nil?

         if $1.empty?
           current << nil
         else
           disallow = URI.parse($1)

           next unless disallow.scheme.nil? or disallow.scheme ==  
uri.scheme
           next unless disallow.port.nil?   or disallow.port == uri.port
           next unless disallow.host.nil?   or
                       disallow.host.downcase == uri.host.downcase

           disallow = disallow.path
           disallow = "/"            if disallow.empty?
           disallow = "/#{disallow}" unless disallow[0] == ?/

           current << disallow
         end
       end
     end

     @rules[location] = if my_rules.empty?
       anon_rules.compact
     else
       my_rules.compact
     end
   end

   def allowed?( text_uri )
     uri      = URI.parse(text_uri)
     location = "#{uri.host}:#{uri.port}"
     path     = uri.path

     return true unless %w{http https}.include?(uri.scheme)

     not @rules[location].any? { |rule| path.index(rule) == 0 }
   end
end