When I think of creating an index in Ruby I think of a Hash. So I
decided to code both and see how they compare. Here are my
non-bitmapped and bitmapped solutions:
class IndexHash
attr_accessor :index
def initialize( documents=nil )
@index = Hash.new( [] )
input( documents ) if documents
end
def input( documents )
documents.each_pair do |symbol, contents|
contents.split.each { |word| insert( symbol, word) }
end
end
def insert( document_symbol, word )
@index[word.downcase] = [] unless @index.has_key?( word.downcase )
@index[word.downcase].push( document_symbol ) unless
@index[word.downcase].include?( document_symbol )
end
def find( word )
@index[ word.downcase ]
end
def words
@index.keys.sort
end
end
class IndexBitmap
attr_accessor :index
def initialize( documents=nil )
@index = []
@documents = {}
input( documents ) if documents
end
def input( documents )
documents.each_pair do |symbol, contents|
contents.split.each { |word| insert( symbol, word) }
end
end
def insert( document_symbol, word )
@index.push( word.downcase ) unless @index.include?( word.downcase )
@documents[ document_symbol ] = 0 unless @documents.has_key?(
document_symbol )
@documents[ document_symbol ] += (1<<@index.index( word.downcase ))
end
def find( word )
result = []
@documents.each_pair do |symbol, value|
result.push( symbol ) if value & (1<<@index.index( word.downcase ))
end
result
end
def words
@index.sort
end
end
To verify this, I used the following tests. I just had to change which
class was being tested (@test_class defined in 'setup'):
require 'test/unit'
require 'index'
class Array
# Contents of the two arrays are the same, but the order may be
different
def equivalent(other)
self.each do |item|
if !other.include?( item ) then
return false
end
end
return true
end
end
DOC1 = "The quick brown fox"
INDEX1 = [ 'the', 'quick', 'brown', 'fox' ]
DOC2 = "Jumped over the brown dog"
INDEX2 = [ 'jumped', 'over', 'the', 'brown', 'dog' ]
DOC3 = "Cut him to the quick"
INDEX3 = [ 'cut', 'him', 'to', 'the', 'quick' ]
class TestIndex < Test::Unit::TestCase
def setup
@test_class = IndexBitmap
@i = @test_class.new
end
def test_index_single_document
@i.input( :doc1=>DOC1 )
assert_equal( INDEX1.sort, @i.words )
end
def test_index_muliple_documents_input_one_at_a_time
@i.input( :doc1=>DOC1 )
@i.input( :doc2=>DOC2 )
@i.input( :doc3=>DOC3 )
assert_equal( (INDEX1+INDEX2+INDEX3).uniq.sort, @i.words )
end
def test_index_muliple_documents_input_all_at_one_time
@i.input( :doc1=>DOC1, :doc2=>DOC2, :doc3=>DOC3 )
assert_equal( (INDEX1+INDEX2+INDEX3).uniq.sort, @i.words )
end
def test_index_single_document_on_new
j = @test_class.new( :doc1=>DOC1 )
assert_equal( INDEX1.sort, j.words )
end
def test_index_muliple_documents_input_all_at_one_time_on_new
j = @test_class.new( :doc1=>DOC1, :doc2=>DOC2, :doc3=>DOC3 )
assert_equal( (INDEX1+INDEX2+INDEX3).uniq.sort, j.words )
end
def test_index_find
@i.input( :doc1=>DOC1, :doc2=>DOC2, :doc3=>DOC3 )
assert_equal( true, [:doc1,:doc2,:doc3].equivalent( @i.find( 'the' )
) )
assert_equal( true, [:doc1,:doc3].equivalent( @i.find( 'quick' ) ) )
assert_equal( true, [:doc2].equivalent( @i.find( 'dog' ) ) )
assert_equal( true, [:doc1,:doc2].equivalent( @i.find( 'brown' ) ) )
end
end