Dave Thomas (Dave / PragmaticProgrammer.com) wrote: [...] > Something I'd like is queries such as > > article contains: +matz +irb -python > > I know I can do this using egrep n times and combining the results, > but I'd really rather not. Given that you're working with a small set of files, why don't you use a simple cross reference database and set operations? See below for a quick and dirty implementation of one (it uses Dan Bernstein's cdb database for a good compromise between size and speed, but should be adaptable to Berkeley DB or GDBM). Note that the size of the cross reference index will be on the order of magnitude of the original data, so it won't work too well for large sets of files. On the other hand, if you don't mind wasting some more space, phrase searching can be implemented at essentially no additional CPU cost. Reimer Behrends #!/usr/bin/env ruby # See end of file for license. require "cdb" # Dan Bernstein's constant database require "find" # Regular expression for words. Should be case-insensitive. WORDPAT = /\b[a-z0-9](?:[a-z0-9'-]*[a-z0-9]|)\b/i # Key in the DB where the file list is stored. INDEX_FILES = "__files__" # gather_index: list of files -> cross reference index # optional block argument to filter file content before scanning # for words. Example: # gather_index(Dir[SOURCE+"/**/*.html"]+Dir[SOURCE+"/**/*.txt"]) do # | filename, data | # if filename =~ /\.html$/ then # extern_filter("html2text -nobs", data) # else # data # end # end # # The resulting index uses numbers instead of file names to reference # files. def gather_index(files) result = {} index = 0 files.each do | file | contents = File.open(file){|fp| fp.read} contents = yield file, contents if block_given? contents.scan(WORDPAT) do | word | word.downcase! result[word] ||= [] list = result[word] if list[-1] != index then list << index end end index += 1 end # The following code will remove common words from the database if # that is desired ("a", "the", "you", ...). #limit = [files.size*2/3,20].max #result.keys.each do # | word | # result.delete word if result[word].size >= limit #end result[INDEX_FILES] = files result end # search: Arguments are: the cross reference index (or any object with # the same basic semantics as the hash and the same contents), lists of # words that must occur, must not occur, and may occur. Result is an # array of file names that satisfy these criteria. def search(index, words=[], exclude=[], oneof=[]) files = index[INDEX_FILES] if words.size == 0 and oneof.size == 0 then return [] end result_and = nil # intersection of all sets of files that contain words in 'words' if words.size > 0 then set = index[words[0]] if set then result_and = set else puts "Not found: \"#{words[0]}\"" end words[1..-1].each do | word | set = index[word] if not set then puts "Not found: \"#{word}\"" next end if result_and then result_and &= set else result_and = set end end end # union of all sets of files that occur in 'oneof' # In addition, calculate weights depending on how many and which terms # are matched. Files that contain more search terms have a higher # weight. If files match the same number of search terms, terms that # occur earlier are given priority. result_or = [] weights = Hash.new(0) weight = oneof.size**2 + oneof.size oneof.each do | word | set = index[word] if not set then puts "Not found: \"#{word}\"" next end result_or |= set set.each do | file | weights[file] += weight end weight -= 1 end # remove all files that contain words in the 'exclude' list. result = result_and || result_or exclude.each do | word | set = index[word] if not set then puts "Not found: \"#{word}\"" next end result -= set end # sort according to weighting criteria and replace indices with # file names. result.sort! {|i1, i2| [-weights[i1], i1] <=> [-weights[i2], i2] } result.map {|i| files[i]} end # marshalled_access adds a [] method that automatically unmarshals the # result. def marshalled_access(index) class <<index def [](key) result = super(key) result and Marshal.load(result) end end end # all_files expands path names on the list and replaces directories # by a list of files that can be found under them. def all_files(list) result = [] list = list.map{|path| File.expand_path path} list.each do | path | if File.file? path then result << path else Find.find(path) do | path2 | result << path2 if File.file? path2 end end end result end # main program op = ARGV.shift indexfile = ARGV.shift case op when "make" then if File.exists? indexfile then $stderr.puts "Index file \"#{indexfile}\" exists -- aborting." exit 1 end files = all_files(ARGV) index = gather_index(files) # We write the index as a database CDB.create(indexfile, indexfile+".tmp") do | db | index.each_pair do | key, value | db[key] = Marshal.dump(value) end end when "search" then # We open the database and disguise it as a hash, rather than a # string -> string mapping with marshalled data. index = CDB.new(indexfile) marshalled_access(index) words = [] oneof = [] exclude = [] ARGV.each do | word | word = word.downcase case word when /^\+/ then oneof << word[1..-1] when /^-/ then exclude << word[1..-1] else words << word end end search(index, words, exclude, oneof).each do | file | puts file end else puts "Usage:" puts "#{$0} make <indexfile> <file...>" puts "#{$0} search <indexfile> <term...> <+term...> <-term...>" puts " Prefix + denotes a word that may occur." puts " Prefix - denotes a word that must not occur." puts " All other words must occur." exit 1 end # Copyright (c) 2002 Reimer Behrends # Permission is hereby granted, free of charge, to any person obtaining # a copy of this software and associated documentation files (the # "Software"), to deal in the Software without restriction, including # without limitation the rights to use, copy, modify, merge, publish, # distribute, sublicense, and/or sell copies of the Software, and to # permit persons to whom the Software is furnished to do so, subject to # the following conditions: # The above copyright notice and this permission notice shall be # included in all copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.