On 10/15/07, Jes=FAs Gabriel y Gal=E1n <jgabrielygalan / gmail.com> wrote:
> On 10/12/07, Ruby Quiz <james / grayproductions.net> wrote:
> > by Benjohn Barnes
> >
> > Usually, you write a regular expression and then search for it in a tex=
t string.
> > How about doing the opposite? Given a regular expression, generate all =
of the
> > strings that it will match.

I made a second version, supporting:

> - Ranges in char groups [a-zA-Z]
> - Backreferences: /(abc)\1/, didn't even thought about them, but
> probably a nightmare.

I also support the {<number>} modifier, previously I only supported
{<num1>,<num2>}.

Backreferences were indeed a nightmare, but I managed a solution. I
tried first to integrate them seemlessly in my architecture with no
luck, since the generation of the groups is isolated from one another,
and there was no easy way. So I settled for the following:

Multigroups will carry a number depending on appearance.
When I generate the result for a multigroup, I add the result of that
group to each string.
When I generate a backreference, I generate a special syntax
(__<num>__) to gsub in a second step, when all groups have been
generated and stored. This can of course fail if the regexp matches
that syntax, since I will later on try to sub things that I shouldn't
(any ideas here are welcome).
In the second step I gsub every occurrence of __<num>__ with the
appropriate group value for that string.

Here's the code and some examples of the new things:

# Number of times to repeat for Star and Plus repeaters
TIMES =3D 2

# Set of chars for Dot and negated [^] char groups
#CHARS =3D [("a".."z").to_a, ("A".."Z").to_a, ".", ",", ";"].flatten
CHARS =3D %w{a b c d e}

class OneTimeRepeater
  def initialize(group)
    @group =3D group
  end

  def result
    @group.result
  end
end

class StarRepeater
  def initialize(group)
    @group =3D group
  end

  def result
    r =3D []
    group_res =3D @group.result
    group_res.unshift("")
    TIMES.times do
      r << group_res
    end
    combine(r).uniq
  end
end

class PlusRepeater
  def initialize(group)
    @group =3D group
  end

  def result
    group_res =3D @group.result
    r =3D [group_res]
    temp =3D [""].concat(group_res)
    (TIMES - 1).times do
      r << temp
    end
    combine(r).uniq
  end
end

class QuestionMarkRepeater
  def initialize(group)
    @group =3D group
  end

  def result
    @group.result.unshift("")
  end
end

class RangeRepeater
  def initialize(group,min,max)
    @group =3D group
    @min =3D min
    @max =3D max
  end

  def result
    result =3D @group.result
    r =3D []
    r << [""] if @min =3D=3D 0
    @min.times {r << result}
    temp =3D result.dup.unshift("")
    if @max
      (@max - @min).times do
        r << temp
      end
    end
    combine(r).uniq
  end
end

class SingleChar
  def initialize(c)
    @c =3D c
  end
  def result
    [@c]
  end
end

class CharGroup
  def initialize(chars)
    @chars =3D chars
    if chars[0] =3D=3D "^"
      @negative =3D true
      @chars =3D @chars[1..-1]
    else
      @negative =3D false
    end

    # Ranges a-b
    # save first and last "-" if present
    first =3D nil
    last =3D nil
    first =3D @chars.shift if @chars.first =3D=3D "-"
    last =3D @chars.pop if @chars.last =3D=3D "-"
    while i =3D @chars.index("-")
      @chars[i-1..i+1] =3D (@chars[i-1]..@chars[i+1]).to_a
    end
    # restore them back
    @chars.unshift(first) if first
    @chars.push(last) if last
  end
  def result
    if @negative
      CHARS - @chars
    else
      @chars
    end
  end
end

class Dot
  def result
    CHARS
  end
end

class MultiGroup
  attr_reader :group_num
  def initialize(groups, group_num)
    @groups =3D groups
    @group_num =3D group_num
  end

  # Generates the result of each contained group
  # and adds the filled group of each result to
  # itself
  def result
    strings =3D @groups.map {|x| x.result}
    result =3D combine(strings)
    result.each {|x| x.add_filled_group(@group_num, x)}
    result
  end
end

class OrGroup
  def initialize(first_groupset, second_groupset)
    @first =3D first_groupset
    @second =3D second_groupset
  end

  def result
    strings =3D @first.map {|x| x.result}
    s =3D combine(strings)
    strings =3D @second.map {|x| x.result}
    s.concat(combine(strings))
  end
end

class BackReference
  attr_reader :num
  def initialize(num)
    @num =3D num
  end

  def result
    ["__#{@num}__"]
  end
end

# Combines arrays, concatenating each string
# merging the possible groups they have
# Starts combining the first two arrays, then goes on
# combining each other array to the result of the
# previous combination
def combine(arrays)
 string =3D arrays.inject do |r, rep|
   temp =3D []
   r.each {|aa| rep.each {|bb| temp << (aa.concat_and_merge_groups(bb))}}
   temp
 end
 string
end

class String
  attr_accessor :filled_groups

  def add_filled_group(num, group)
    @filled_groups ||=3D {}
    @filled_groups[num] =3D group
  end

  def concat_and_merge_groups(other)
    temp =3D self + other
    groups =3D {}
    groups.merge!(self.filled_groups) if self.filled_groups
    groups.merge!(other.filled_groups) if other.filled_groups
    temp.filled_groups =3D groups
    temp
  end

end

class Regexp
  attr_reader :num_groups
  def parse(s, i =3D 0)
    repeaters =3D []
    group =3D nil
    while i < s.length
      char =3D s[i].chr
      case char
      when '('
        num =3D @num_groups + 1
        @num_groups +=3D 1
        groups, i =3D parse(s, i+1)
        group =3D MultiGroup.new(groups, num)
      when ')'
        return repeaters,i
      when '['
        chars =3D []
        i +=3D 1
        until s[i].chr =3D=3D ']'
          chars << s[i].chr
          i +=3D 1
        end
        group =3D CharGroup.new(chars)
      when '.'
        group =3D Dot.new
      when '|'
        groups, i =3D parse(s, i + 1)
        group =3D OrGroup.new(repeaters, groups)
        return [group], i
      when '\\'
        i +=3D 1
        p s[i..-1]
        m =3D s[i..-1].match(/^(\d+)/)
        if m
          group =3D BackReference.new(m[0].to_i)
          i +=3D m[0].size - 1
        end
      else
        group =3D SingleChar.new(char)
      end

      repeater =3D nil
      i +=3D 1
      if i < s.length
        case s[i].chr
        when '*'
          repeater =3D StarRepeater.new(group)
        when '+'
          repeater =3D PlusRepeater.new(group)
        when '?'
          repeater =3D QuestionMarkRepeater.new(group)
        when '{'
          m =3D s[i..-1].match(/\{(\d+)(,(\d+))?\}/)
          first =3D m[1].to_i if m[1]
          second =3D m[3].to_i if m[3]
          repeater =3D RangeRepeater.new(group, first, second)
          i +=3D m[0].size - 1
        else
          repeater =3D OneTimeRepeater.new(group)
          i -=3D 1
        end
        i +=3D 1
      else
        repeater =3D OneTimeRepeater.new(group)
      end
      repeaters << repeater
    end
    return repeaters, i
  end

  def generate
    @num_groups =3D 0
    r =3D self.inspect[1..-2]
    repeaters, _ =3D self.parse(r)
    strings =3D repeaters.map {|x| x.result}
    s =3D combine(strings)
    # Makes a pass for the backreferences
    s.each do |string|
      string.gsub!(/__(\d+)__/) do |match|
        string.filled_groups[$1.to_i]
      end
    end
    s
  end
end

def show(regexp)
  s =3D regexp.generate
  puts "#{regexp.inspect} --> #{s.inspect}"
  puts "Checking..."
  errors =3D s.reject {|string| string =3D~ regexp}
  if errors.size =3D=3D 0
    puts "All strings match"
  else
    puts "These don't match: #{errors.inspect}"
  end
end


Samples:

show(/a{3}bcd/)
show(/a[a-d1-9-]/)
show(/a[-a-c1-3]/)
show(/a(b|c)d\1xyz/)
show(/(a)(b)(c)(d)(e)(f)(g)(h)(i)(jjjj|xxxx)(k)\10/)

/a{3}bcd/ --> ["aaabcd"]

/a[a-d1-9-]/ --> ["aa", "ab", "ac", "ad", "a1", "a2", "a3", "a4",
"a5", "a6", "a7", "a8", "a9", "a-"]

/a[-a-c1-3]/ --> ["a-", "aa", "ab", "ac", "a1", "a2", "a3"]

/a(b|c)d\1xyz/ --> ["abdbxyz", "acdcxyz"]

/(a)(b)(c)(d)(e)(f)(g)(h)(i)(jjjj|xxxx)(k)\10/ -->
["abcdefghijjjjkjjjj", "abcdefghixxxxkxxxx"]

Regards,

Jesus.