Dave Thomas (Dave / PragmaticProgrammer.com) wrote:
[...]
>  Something I'd like is queries such as
>  
>    article contains:  +matz +irb -python
>  
>  I know I can do this using egrep n times and combining the results,
>  but I'd really rather not.

Given that you're working with a small set of files, why don't you use a
simple cross reference database and set operations? See below for a
quick and dirty implementation of one (it uses Dan Bernstein's cdb
database for a good compromise between size and speed, but should be
adaptable to Berkeley DB or GDBM). Note that the size of the cross
reference index will be on the order of magnitude of the original data,
so it won't work too well for large sets of files. On the other hand, if
you don't mind wasting some more space, phrase searching can be
implemented at essentially no additional CPU cost.

			Reimer Behrends


#!/usr/bin/env ruby

# See end of file for license.

require "cdb"		# Dan Bernstein's constant database
require "find"

# Regular expression for words. Should be case-insensitive.

WORDPAT = /\b[a-z0-9](?:[a-z0-9'-]*[a-z0-9]|)\b/i

# Key in the DB where the file list is stored.

INDEX_FILES = "__files__"

# gather_index: list of files -> cross reference index
# optional block argument to filter file content before scanning
# for words. Example:
# gather_index(Dir[SOURCE+"/**/*.html"]+Dir[SOURCE+"/**/*.txt"]) do
#   | filename, data |
#   if filename =~ /\.html$/ then
#     extern_filter("html2text -nobs", data)
#   else
#     data
#   end
# end
# 
# The resulting index uses numbers instead of file names to reference
# files.

def gather_index(files)
  result = {}
  index = 0
  files.each do
    | file |
    contents = File.open(file){|fp| fp.read}
    contents = yield file, contents if block_given?
    contents.scan(WORDPAT) do
      | word |
      word.downcase!
      result[word] ||= []
      list = result[word]
      if list[-1] != index then
        list << index
      end
    end
    index += 1
  end
  # The following code will remove common words from the database if
  # that is desired ("a", "the", "you", ...).

  #limit = [files.size*2/3,20].max
  #result.keys.each do
  #  | word |
  #  result.delete word if result[word].size >= limit
  #end

  result[INDEX_FILES] = files
  result
end

# search: Arguments are: the cross reference index (or any object with
# the same basic semantics as the hash and the same contents), lists of
# words that must occur, must not occur, and may occur. Result is an
# array of file names that satisfy these criteria.

def search(index, words=[], exclude=[], oneof=[])
  files = index[INDEX_FILES]
  if words.size == 0 and oneof.size == 0 then
    return []
  end
  result_and = nil
  # intersection of all sets of files that contain words in 'words'
  if words.size > 0 then
    set = index[words[0]]
    if set then
      result_and = set
    else
      puts "Not found: \"#{words[0]}\""
    end
    words[1..-1].each do
      | word |
      set = index[word]
      if not set then
        puts "Not found: \"#{word}\""
	next
      end
      if result_and then
	result_and &= set
      else
        result_and = set
      end
    end
  end
  # union of all sets of files that occur in 'oneof'
  # In addition, calculate weights depending on how many and which terms
  # are matched. Files that contain more search terms have a higher
  # weight. If files match the same number of search terms, terms that
  # occur earlier are given priority.
  result_or = []
  weights = Hash.new(0)
  weight = oneof.size**2 + oneof.size
  oneof.each do
    | word |
    set = index[word]
    if not set then
      puts "Not found: \"#{word}\""
      next
    end
    result_or |= set
    set.each do | file | weights[file] += weight end
    weight -= 1
  end
  # remove all files that contain words in the 'exclude' list.
  result = result_and || result_or
  exclude.each do
    | word |
    set = index[word]
    if not set then
      puts "Not found: \"#{word}\""
      next
    end
    result -= set
  end
  # sort according to weighting criteria and replace indices with
  # file names.
  result.sort! {|i1, i2| [-weights[i1], i1] <=> [-weights[i2], i2] }
  result.map {|i| files[i]}
end

# marshalled_access adds a [] method that automatically unmarshals the
# result.

def marshalled_access(index)
  class <<index
    def [](key)
      result = super(key)
      result and Marshal.load(result)
    end
  end
end

# all_files expands path names on the list and replaces directories
# by a list of files that can be found under them.

def all_files(list)
  result = []
  list = list.map{|path| File.expand_path path}
  list.each do
    | path |
    if File.file? path then
      result << path
    else
      Find.find(path) do
        | path2 |
	result << path2 if File.file? path2
      end
    end
  end
  result
end

# main program

op = ARGV.shift
indexfile = ARGV.shift

case op
when "make" then
  if File.exists? indexfile then
    $stderr.puts "Index file \"#{indexfile}\" exists -- aborting."
    exit 1
  end
  files = all_files(ARGV)
  index = gather_index(files)
  # We write the index as a database
  CDB.create(indexfile, indexfile+".tmp") do
    | db |
    index.each_pair do
      | key, value |
      db[key] = Marshal.dump(value)
    end
  end
when "search" then
  # We open the database and disguise it as a hash, rather than a
  # string -> string mapping with marshalled data.
  index = CDB.new(indexfile)
  marshalled_access(index)
  words = []
  oneof = []
  exclude = []
  ARGV.each do
    | word |
    word = word.downcase
    case word
    when /^\+/ then
      oneof << word[1..-1]
    when /^-/ then
      exclude << word[1..-1]
    else
      words << word
    end
  end
  search(index, words, exclude, oneof).each do
    | file |
    puts file
  end
else
  puts "Usage:"
  puts "#{$0} make <indexfile> <file...>"
  puts "#{$0} search <indexfile> <term...> <+term...> <-term...>"
  puts "  Prefix + denotes a word that may occur."
  puts "  Prefix - denotes a word that must not occur."
  puts "  All other words must occur."
  exit 1
end

# Copyright (c) 2002 Reimer Behrends

# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:

# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.