--------------090601070206080908080803
Content-Type: text/plain; charset=us-ascii; format=flowed
Content-Transfer-Encoding: 7bit
Here's my solution. It builds a tree of the Gedcom nodes.
It supports a broad subset of the Gedcom specification, can output XML,
YAML and pretty-print, has error checks and is reasonably short.
Note that the YAML representation will not reuse the IDs that were
specified in the original Gedcom file, but rather create its own. I
don't know if there is an easy way of making YAML use pre-specified IDs.
The XML representation uses <ref to="@ID@" /> for representing links.
The YAML and pp emitters blow up the stack when given the CPAN sample
data. There's not too much I can do about this.
The XML emitter tries hard to make the output as pretty as possible.
This includes trying to use a value="..." attribute when appropriate. (It won't get
used when the value contains multi-line data.)
Data is read from ARGF, which means either standard input or filenames
that were given on the command line.
I've also attached sample output for the file given on
http://heiner-eichmann.de/gedcom/simple.ged
--------------090601070206080908080803
Content-Type: text/plain;
name="gedcom.rb"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
filename="gedcom.rb"
require 'cgi'

# Reads GEDCOM text into a tree of Gedcom::Node objects and renders that
# tree as XML, YAML, or pretty-printed / inspected Ruby data.
module Gedcom
  # Raised by Gedcom.parse for unparsable lines, inconsistent nesting,
  # misplaced CONC/CONT lines and pointers to undefined ids.
  class ParseError < ArgumentError; end

  # One GEDCOM line. The Hash part maps a tag name to the Array of child
  # nodes carrying that tag; a missing tag yields (and stores) a new Array.
  #
  # Attributes:
  #   value        - nil, a String, or (for pointer lines) the referenced Node
  #   origin       - line number the node was created from
  #   special_type - "CONC"/"CONT" for continuation nodes, nil otherwise
  #   id           - the cross-reference id of the line (e.g. "@I1@"), or nil
  class Node < Hash
    attr_accessor :value, :origin, :special_type, :id

    # @param origin [Integer, nil] line number this node was read from
    def initialize(origin = nil)
      @value = nil
      @origin = origin
      @special_type = nil
      @id = nil
      @as_plain_hash_cache = {}
      super() { |hash, key| hash[key] = [] }
    end

    # True for continuation (CONC/CONT) nodes, which may not have children.
    def special?
      !@special_type.nil?
    end

    # Content hash over the value and the children. A Node value is replaced
    # by a marker so that pointer cycles do not recurse endlessly.
    def hash
      [@value.is_a?(Node) ? :recursive : @value, super].hash
    end

    # Nodes compare equal when their content hashes match.
    def ==(other)
      hash == other.hash
    end

    # Replaces the children, value and origin with those of +other+.
    #
    # @param other [Node]
    # @return [self] like Hash#replace
    def replace(other)
      super(other)
      @value = other.value
      @origin = other.origin
      self
    end

    # Plain-Hash view of this node: tags with exactly one child map directly
    # to that child, and a non-nil value is stored under :value.
    #
    # YAML detects self-referencing structures by comparing object_ids, so
    # the Hash built for a given content hash is cached and handed out again.
    def as_plain_hash
      key = hash
      @as_plain_hash_cache.fetch(key) do
        plain = each_with_object({}) do |(tag, nodes), acc|
          acc[tag] = nodes.size == 1 ? nodes.first : nodes
        end
        plain[:value] = @value unless @value.nil?
        @as_plain_hash_cache[key] = plain
      end
    end
    private :as_plain_hash

    # The bare String for a childless node with a String value, otherwise
    # the plain-Hash view.
    def as_value
      if @value.is_a?(String) && empty?
        @value
      else
        as_plain_hash
      end
    end

    def to_yaml_type
      '!map'
    end

    # Emits the #as_value representation as YAML.
    def to_yaml(opts = {})
      as_value.to_yaml(opts)
    end

    def inspect
      as_value.inspect
    end

    def pretty_print(q)
      as_value.pretty_print(q)
    end

    # Renders this node's content as XML lines.
    #
    # A pointer value becomes <ref to="ID" />, followed by any child
    # elements. Each child becomes an element named after its lower-cased
    # tag. At level 0 the output is wrapped in <gedcom>...</gedcom>.
    #
    # NOTE(review): the indentation width (two spaces per level) is a
    # reconstruction; confirm against reference output if exact layout matters.
    #
    # @param level [Integer] nesting depth of this node
    # @return [String]
    def to_xml(level = 0)
      indent = '  ' * (level + 1)
      lines = []
      if @value.is_a?(Node)
        lines << "#{indent}<ref to=\"#{CGI.escapeHTML(@value.id.to_s)}\" />"
      end
      each do |tag, nodes|
        nodes.each { |node| lines << element_xml(tag, node, level, indent) }
      end
      body = lines.join("\n")
      level.zero? ? "<gedcom>\n#{body}\n</gedcom>" : body
    end

    # Renders one child +node+ as an XML element named after +tag+.
    def element_xml(tag, node, level, indent)
      name = tag.downcase
      id_attr = node.id.nil? ? '' : " id=\"#{CGI.escapeHTML(node.id.to_s)}\""
      text = node.value.is_a?(String) ? CGI.escapeHTML(node.value) : nil

      if node.value.nil? && node.empty?
        "#{indent}<#{name}#{id_attr} />"
      elsif node.empty? && text
        "#{indent}<#{name}#{id_attr}>#{text}</#{name}>"
      elsif text && text.include?("\n")
        # Multi-line text cannot live in an attribute; emit it (escaped) as
        # element content ahead of the children.
        "#{indent}<#{name}#{id_attr}>\n" \
          "#{indent}  #{text}\n" \
          "#{node.to_xml(level + 1)}\n" \
          "#{indent}</#{name}>"
      else
        val_attr = text ? " value=\"#{text}\"" : ''
        "#{indent}<#{name}#{id_attr}#{val_attr}>\n" \
          "#{node.to_xml(level + 1)}\n" \
          "#{indent}</#{name}>"
      end
    end
    private :element_xml
  end

  # Captures: level, optional id, tag, optional pointer value, optional text.
  LineRegexp = /^\s*(\d+)\s+(?:(@\w[^@]*@)\s+)?(\w+)(?:\s+(?:(@\w[^@]*@)|(.+)))?\s*$/

  # Parses GEDCOM text into a tree.
  #
  # @param data [#each_line, #each] GEDCOM text (String or IO), or a
  #   collection of lines
  # @return [Node] root node whose children are the level-0 records
  # @raise [ParseError] on an unparsable line, a level that skips a
  #   generation, a child below a CONC/CONT line, a CONC/CONT line carrying
  #   an id, or a pointer to an id that is never defined
  def parse(data)
    root = Node.new(1)
    stack = [root]
    node_by_id = {}
    nodes_with_refs = []
    lines = data.respond_to?(:each_line) ? data.each_line : data.each

    lines.each_with_index do |raw, index|
      line_no = index + 1
      line = raw.chomp # drop "\n" / "\r\n" so values carry no stray "\r"
      md = LineRegexp.match(line)
      unless md
        next if line.strip.empty? # whitespace-only lines are ignored

        raise(ParseError, "Parse error at line #{line_no}")
      end

      level, id, tag, value_id, value = md.captures
      level = level.to_i
      value = value.gsub('@@', '@') if value # "@@" is an escaped "@"

      depth = stack.size - 1
      raise(ParseError, "Inconsistent nesting at line #{line_no}") if level > depth

      stack.pop(depth - level)
      parent = stack.last
      if parent.special?
        raise(ParseError, "Can't create sub node for special node " +
          "of type #{parent.special_type} " +
          "(defined at #{parent.origin}) at #{line_no}")
      end

      new_node = Node.new(line_no)
      has_id = id && !id.empty?
      has_value = value && !value.empty?

      if %w[CONC CONT].include?(tag)
        raise(ParseError, "#{tag} node can't have id at line #{line_no}") if has_id

        # Continuation text is appended to the parent's value. The node is
        # only pushed on the stack (never attached or treated as a pointer)
        # so that a deeper following line is rejected above.
        new_node.special_type = tag
        separator = tag == 'CONT' ? "\n" : ''
        text = has_value ? value : value_id
        parent.value = parent.value.to_s + separator + text.to_s
      else
        if has_id
          node_by_id[id] = new_node
          new_node.id = id
        end
        if has_value
          new_node.value = value
        elsif value_id && !value_id.empty?
          nodes_with_refs << new_node
          new_node.value = value_id # id is temporarily stored in value
        end
        parent[tag] << new_node
      end
      stack << new_node
    end

    # Second pass: swap the stored ids for the nodes they point to.
    nodes_with_refs.each do |node|
      ref_id = node.value
      node.value = node_by_id.fetch(ref_id) do
        raise(ParseError, "Pointer to undefined node `#{ref_id}' at line #{node.origin}")
      end
    end
    root
  end
  module_function :parse
end
# Script entry point: reads GEDCOM data from ARGF (standard input or the
# files named on the command line) and prints it pretty-printed, as YAML
# and as XML. A stack overflow in the first two emitters is reported and
# does not stop the remaining output.
if __FILE__ == $0
  require 'pp'
  require 'yaml'

  tree = Gedcom.parse(ARGF.read)

  puts "Pretty-printed:"
  begin
    pp tree
  rescue SystemStackError
    puts "Sorry, pp blowed up the stack."
  end

  puts "", "As YAML:"
  begin
    # Kernel#y is not available outside irb on current Rubies; print the
    # YAML text explicitly instead.
    puts tree.to_yaml
  rescue SystemStackError
    puts "Sorry, YAML blowed up the stack."
  end

  puts "", "As XML:"
  puts tree.to_xml
end
--------------090601070206080908080803--