Here's my solutions. I wrote this 2.5 times. The first version
worked, but it was slow as molasses. The second one is much much
faster. The 2.5th inherits from the second one and produces more
readable, complete sentences
first, slow version:
% cat markov.rb
class WordTable
def initialize(src, separator = /\W+/, word_shape = /\w+/)
@src = src
@separator = separator
@word_shape = word_shape
end
def build_table
@table = Hash.new { |h, k| h[k] = [] }
pos = 0
while line = @src.gets
line = line.chomp.downcase
words = line.split(@separator).select { |potential_word|
potential_word.match(@word_shape)
}
words.each do |word|
@table[word] << pos
pos += 1
end
end
self
end
def words
@table.keys
end
def positions_for(word)
@table[word]
end
def followers_for(word)
positions = positions_for(word)
followers_positions = positions.map { |i| i + 1 }
following_words = self.words.select { |key_word|
followers_positions.any? { |pos| positions_for
(key_word).include?(pos) }
}
following_words
end
end
class ChainBuilder
attr_accessor :chain_length
def initialize(initial_word, word_table, chain_length = 10)
@initial_word = initial_word
@word_table = word_table
@chain_length = chain_length
end
def distributed_followers(word)
distd_followers = []
followers = @word_table.followers_for(word)
positions_of_word = @word_table.positions_for(word)
followers.each do |follower|
follower_positions = @word_table.positions_for(follower)
count = follower_positions.select { |pos|
prec = pos - 1
positions_of_word.include?(prec)
}.length
distd_followers += [follower] * count
end
distd_followers
end
def randomized_next_word(word)
choices = distributed_followers(word)
choices[ rand(choices.length) ]
end
def chain
res_chain = [@initial_word]
(self.chain_length - 1).times do
res_chain << randomized_next_word(res_chain.last)
end
res_chain
end
end
if $0 == __FILE__
if ARGV[0].nil?
STDERR.puts "Please provide a corpus."
STDERR.puts "#$0 usage: #$0 <corpus file name> [chain length]
[initial word]"
exit 1
end
chain_len = (ARGV[1] || 10).to_i
wt = nil
File.open(ARGV[0]) do |file|
#wt = WordTable.new(file, //, /./) # try by characters
wt = WordTable.new(file)
wt.build_table
end
init_word = (ARGV[2] || wt.words[ rand( wt.words.length ) ] )
chain_builder = ChainBuilder.new(init_word, wt, chain_len)
chain = chain_builder.chain
puts chain.join(' ').capitalize
end
Second, quicker version:
% cat markov2.rb
class MarkovChainBuilder
class Entry < Struct.new(:word_index, :successors)
end
def initialize(src, separator = /\W+/, word_shape = /\w+/,
downcase = true)
@src = src
@separator = separator
@word_shape = word_shape
@downcase = downcase
build_tables
end
def build_tables
@word_list = []
@successor_lists = {}
last_word = nil
@src.each do |line|
line = line.downcase if @downcase
line = line.chomp
words_for_line = line.split(@separator).select{ |w| w.match
(@word_shape)}
words_for_line.each do |word|
unless @successor_lists.has_key? word
entry = @successor_lists[word] = Entry.new
@word_list << word
entry.word_index = @word_list.length - 1
entry.successors = []
end
unless last_word.nil?
@successor_lists[last_word].successors << @successor_lists
[word].word_index
end
last_word = word
end
end
end
def distributed_successors_for(word)
@successor_lists[word].successors.map { |index| @word_list[index] }
end
def randomized_next_for(word)
succs = distributed_successors_for(word)
succs[ rand(succs.length) ]
end
def markov_chain(initial_word, len = 10)
res = [initial_word]
(len - 1).times do
res << randomized_next_for(res.last)
end
res
end
def words
@word_list
end
private :build_tables
end
if $0 == __FILE__
if ARGV[0].nil?
STDERR.puts "Please provide a corpus."
STDERR.puts "#$0 usage: #$0 <corpus file name> [chain length]
[initial word]"
exit 1
end
chain_len = (ARGV[1] || 10).to_i
mc = nil
File.open(ARGV[0]) do |file|
mc = MarkovChainBuilder.new(file)
end
init_word = (ARGV[2] || mc.words[ rand( mc.words.length ) ] )
chain = mc.markov_chain(init_word, chain_len)
puts chain.join(' ').capitalize
end
And now the third version which is basically the second version, but
looks for sentences:
% cat smarter_markov.rb
require 'markov2'
class SmarterMarkovChainBuilder < MarkovChainBuilder
def initialize(src)
super(src, /\s+/, /\S+/, false)
end
def markov_chain(initial_word, len = 10)
initial_chain = super
index_of_last_word = nil
initial_chain.each_index do |idx|
if initial_chain[idx] =~ /\.|\?|!/
index_of_last_word = idx
break
end
end
if index_of_last_word
initial_chain[0..index_of_last_word]
else
initial_chain
end
end
end
if $0 == __FILE__
mc = nil
File.open(ARGV[0]) do |file|
mc = SmarterMarkovChainBuilder.new(file)
end
start_words = mc.words.select { |w| w =~ /^[A-Z]/ }
init_word = start_words[ rand(start_words.length) ]
puts mc.markov_chain(init_word, 200).join(' ').gsub(/"/,'')
end