Hello all,


The below code is too slow for practical use. I need it to run at least
20 times faster. Perhaps that is possible with some C code? I have no
experience with writing Ruby extensions. What are the pitfalls? Which
part of the code should be ported? Any pointers to get me started?

Cheers,


Martin


#!/usr/bin/env ruby

# IUPAC nucleotide pair ambiguity equivalents are saved in an
# array of bit fields.

BIT_A = 1 << 0
BIT_T = 1 << 1
BIT_C = 1 << 2
BIT_G = 1 << 3

EQUAL          = Array.new(256, 0)
EQUAL['A'.ord] = BIT_A
EQUAL['T'.ord] = BIT_T
EQUAL['U'.ord] = BIT_T
EQUAL['C'.ord] = BIT_C
EQUAL['G'.ord] = BIT_G
EQUAL['M'.ord] = (BIT_A|BIT_C)
EQUAL['R'.ord] = (BIT_A|BIT_G)
EQUAL['W'.ord] = (BIT_A|BIT_T)
EQUAL['S'.ord] = (BIT_C|BIT_G)
EQUAL['Y'.ord] = (BIT_C|BIT_T)
EQUAL['K'.ord] = (BIT_G|BIT_T)
EQUAL['B'.ord] = (BIT_C|BIT_G|BIT_T)
EQUAL['D'.ord] = (BIT_A|BIT_G|BIT_T)
EQUAL['H'.ord] = (BIT_A|BIT_C|BIT_T)
EQUAL['V'.ord] = (BIT_A|BIT_C|BIT_G)
EQUAL['N'.ord] = (BIT_A|BIT_C|BIT_G|BIT_T)

# Module containing code to locate nucleotide patterns in sequences
allowing for
# ambiguity codes and a given maximum edit distance.
# Insertions are nucleotides found in the pattern but not in the
sequence.
# Deletions are nucleotides found in the sequence but not in the
pattern.
#
# Inspired by the paper by Bruno Woltzenlogel Paleo (page 197):
# http://www.logic.at/people/bruno/Papers/2007-GATE-ESSLLI.pdf
module PatternMatcher
  #
------------------------------------------------------------------------------
  #   str.match(pattern[, pos[, max_edit_distance]])
  #   -> Match or nil
  #
  #
------------------------------------------------------------------------------
  # Method to locate the next pattern match starting from a given
position. A match
  # is allowed to contain a given maximum edit distance. If a match is
located a
  # Match object will be returned otherwise nil.
  def match(pattern, pos = 0, max_edit_distance = 0)
    @pattern           = pattern
    @pos               = pos
    @max_edit_distance = max_edit_distance
    @vector            = vector_init

    while @pos < @seq.length
      vector_update

      return match_new if match_found?

      @pos += 1
    end
  end

  #
------------------------------------------------------------------------------
  #   str.scan(pattern[, pos[, max_edit_distance]])
  #   -> Array
  #   str.scan(pattern[, pos[, max_edit_distance]]) { |match|
  #     block
  #   }
  #   -> Match
  #
  #
------------------------------------------------------------------------------
  # Method to iterate through a sequence to locate pattern matches
starting
  # from a given position and allowing for a maximum edit distance.
  # Matches found in block context return the Match object. Otherwise
matches are
  # returned in an Array.
  def scan(pattern, pos = 0, max_edit_distance = 0)
    matches = []

    while match = match(pattern, pos, max_edit_distance)
      if block_given?
        yield match
      else
        matches << match
      end

      pos = match.pos + 1
    end

    return matches unless block_given?
  end

  private

  # Method to initailize the score vector and return this.
  def vector_init
    vector = []

    (0 ... @pattern.length + 1).each do |i|
      vector[i] = Score.new(matches = 0, mismatches = 0, insertions = i)
    end

    vector
  end

  # Method to update the score vector.
  def vector_update
    new_vector = @vector.dup

    (0 ... @pattern.length).each do |i|
      if match?(@seq[@pos], @pattern[i])
        new_vector[i + 1] = @vector[i].dup
        new_vector[i + 1].matches += 1
      else
        mismatch  = @vector[i].dup
        insertion = new_vector[i].dup
        deletion  = @vector[i + 1].dup

        if deletion?(mismatch, insertion, deletion)
          deletion.deletions += 1
          new_vector[i + 1] = deletion
        elsif mismatch?(mismatch, insertion, deletion)
          mismatch.mismatches += 1
          new_vector[i + 1] = mismatch
        elsif insertion?(mismatch, insertion, deletion)
          insertion.insertions += 1
          new_vector[i + 1] = insertion
        end
      end
    end

    @vector = new_vector
  end

  # Method to determine if a match occurred.
  def match?(char1, char2)
    (EQUAL[char1.upcase.ord] & EQUAL[char2.upcase.ord]) != 0
  end

  # Method to determine if a mismatch occured.
  def mismatch?(mismatch, insertion, deletion)
    if mismatch.edit_distance <= insertion.edit_distance and
       mismatch.edit_distance <= deletion.edit_distance
      true
    end
  end

  # Method to determine if an insertion occured.
  def insertion?(mismatch, insertion, deletion)
    if insertion.edit_distance <= mismatch.edit_distance and
       insertion.edit_distance <= deletion.edit_distance
      true
    end
  end

  # Method to determine if a deletion occured.
  def deletion?(mismatch, insertion, deletion)
    if deletion.edit_distance <= mismatch.edit_distance and
       deletion.edit_distance <= insertion.edit_distance
      true
    end
  end

  # Method to print the score vector.
  def vector_print
    @vector.each do |s|
      puts s
    end

    puts
  end

  # Method that returns a Match object initialized with
  # information from the score vector.
  def match_new
    matches    = @vector.last.matches
    mismatches = @vector.last.mismatches
    insertions = @vector.last.insertions
    deletions  = @vector.last.deletions
    length     = @pattern.length - insertions + deletions
    pos        = @pos - length + 1
    match      = @seq[pos ... pos + length]

    Match.new(pos, match, matches, mismatches, insertions, deletions,
length)
  end

  # Method that determines if a match was found by analyzing the score
vector.
  def match_found?
    if @vector.last.edit_distance <= @max_edit_distance
      true
    end
  end

  # Class to instantiate Score objects that holds score information.
  class Score
    attr_accessor :matches, :mismatches, :insertions, :deletions

    def initialize(matches = 0, mismatches = 0, insertions = 0,
deletions = 0)
      @matches    = matches
      @mismatches = mismatches
      @insertions = insertions
      @deletions  = deletions
    end

    # Method to calculate and return the edit distance.
    def edit_distance
      self.mismatches + self.insertions + self.deletions
    end

    private

    def to_s
      "(#{[self.matches, self.mismatches, self.insertions,
self.deletions].join(',')})"
    end
  end

  # Class for creating Match objects which contain the description of a
  # match between a nucleotide sequence and a pattern.
  class Match
    attr_reader :pos, :match, :matches, :mismatches, :insertions,
:deletions, :edit_distance, :length

    def initialize(pos, match, matches, mismatches, insertions,
deletions, length)
      @pos           = pos
      @match         = match
      @matches       = matches
      @mismatches    = mismatches
      @insertions    = insertions
      @deletions     = deletions
      @edit_distance = mismatches + insertions + deletions
      @length        = length
    end
  end
end

-- 
Posted via http://www.ruby-forum.com/.