加载YAML并获取每个键的行号

时间:2015-04-05 22:16:41

标签: ruby yaml

假设我有一个YAML文件,如下所示:

  en:
    errors:
      # Some comment
      format: "%{attribute} %{message}"

      # One more comment
      messages:
        "1": "Message 1"
        "2": "Message 2"

    long_error_message: |
      This is a
      multiline message

    date:
      format: "YYYY-MM-DD"

我怎样才能把它读成像这样的Ruby Hash

{
  'en': {
    'errors': {
      'format': { value: '%{attribute} %{message}', line: 4 }
      'messages': {
        '1': { value: 'Message 1', line: 8 },
        '2': { value: 'Message 2', line: 9 }
      }
      'long_error_message' : { value: "This is a\nmultiline message", line: 11 }
    },
    'date': {
      'format': { value: 'YYYY-MM-DD', line: 16 }
    }
  }
}

我尝试以"YAML: Find line number of key?"中提到的提示作为起点,实现了一个Psych::Handler,但感觉必须重写Psych中的大量代码才能使其工作。

有什么办法可以解决这个问题吗?

4 个答案:

答案 0 :(得分:5)

看起来您想要采用任何标量值作为映射值,并将其替换为包含原始值的value键的哈希值,以及带有行号的line键。

以下代码几乎可行,主要问题出在多行字符串上:此时给出的行号是Yaml中下一项内容开始的位置。原因是,当处理程序的scalar方法被调用时,解析器已经越过了我们关心的标量,所以mark给出的是解析器确认标量已结束时所在位置的行。对于您示例中的大多数情况,这并不重要,但在多行情况下会给出错误的值。如果不深入Psych的C代码,我看不出有什么办法能通过mark得到标量起始位置的信息。

require 'psych'

# Psych first turns the Yaml text into a tree (AST) of Node objects.
# Give every node a read/write `line` attribute so that the parse
# handler has somewhere to record a source line.
Psych::Nodes::Node.class_eval do
  attr_accessor :line
end

# Custom parse handler. TreeBuilder is the "usual" handler that builds
# the AST; this subclass additionally stamps every scalar node with the
# line reported by the parser while the node is being created.
class LineNumberHandler < Psych::TreeBuilder

  # Assigned by the caller after construction: the handler has to be
  # able to reach the parser in order to call mark.
  attr_accessor :parser

  # Only scalars are of interest. The mark is read first, then
  # TreeBuilder creates the node as usual, and finally the line is
  # stored on the node, which is returned.
  def scalar value, anchor, tag, plain, quoted, style
    current_line = parser.mark.line
    super.tap { |node| node.line = current_line }
  end
end

# Final step: turn the AST into plain Ruby objects. Psych does this
# with the ToRuby visitor (visitor pattern). The visitor is reopened
# here rather than subclassed, so that the ordinary to_ruby call picks
# up the change.
class Psych::Visitors::ToRuby

  # Fills +hash+ from the key/value child nodes of mapping node +o+
  # and returns it. Yaml mappings that carry tags may not be handled
  # correctly by this simplified version.
  def revive_hash hash, o
    o.children.each_slice(2) do |key_node, value_node|
      key = accept(key_node)
      converted = accept(value_node)

      # The important bit: a scalar value is stored together with its
      # line. The recorded line is 0 based, hence the + 1.
      hash[key] =
        if value_node.is_a? ::Psych::Nodes::Scalar
          { "value" => converted, "line" => value_node.line + 1 }
        else
          converted
        end

      # Handling of << (hash merging) is left out on purpose. If it is
      # needed, it will probably have to be copied in from:
      # https://github.com/tenderlove/psych/blob/v2.0.13/lib/psych/visitors/to_ruby.rb#L333-L365
    end
    hash
  end
end

yaml = get_yaml_from_wherever

# Wire things up: the parser drives the handler, and the handler needs
# a reference back to the parser so it can ask for the current mark.
handler = LineNumberHandler.new
handler.parser = Psych::Parser.new(handler)

# Run the parse, then convert the resulting tree. Because ToRuby itself
# was patched (not subclassed), the plain to_ruby call is enough.
handler.parser.parse yaml
puts handler.root.to_ruby

答案 1 :(得分:3)

我建议你选择@matt的解决方案。它不仅更加整洁,而且能正确处理标量。


诀窍可能是monkeypatch TreeBuilder#scalar方法:

# Sample YAML document used by the example below. The literal starts
# with a line break, so `en:` sits on the second line of the string.
y='
en:
  errors:
    # Some comment
    format: "%{attribute} %{message}"

    # One more comment
    messages:
      "1": "Message 1"
      "2": "Message 2"

  long_error_message: |
    This is a
    multiline message

  date:
    format: "YYYY-MM-DD"'

require 'yaml'

# Anonymous handler class derived from whatever handler class the
# default YAML parser uses.
yphc = Class.new(YAML.parser.handler.class) do
  # Scalars whose style is greater than 1 are replaced by a hash holding
  # the original value and the line remembered from the previous scalar
  # event; other scalars are passed through untouched. Relies on the
  # globals $parser (the running parser) and $line.
  def scalar value, anchor, tag, plain, quoted, style
    payload = style > 1 ? { value: value, line: $line } : value
    # Remember the line for the next scalar event
    # (this is what handles multilines properly).
    $line = $parser.mark.line + 1
    super payload, anchor, tag, plain, quoted, style
  end
end

# The handler reads the parser through this global while parsing.
$parser = Psych::Parser.new(yphc.new)

# Only the first document of the stream is used; more careful handling
# is required for multidocs.
stream = $parser.parse(y).handler.root
result = stream.to_ruby[0]

实际上,我们差不多完成了。唯一剩下的就是只在叶子节点中保留带行号的修补值。我故意没有把这个逻辑放进解析部分。

# Returns a copy of +hash+ in which every key that is itself a Hash
# (a key wrapped as { value:, line: }) is replaced by its :value entry.
# Hash values are processed recursively; all other values are kept
# as they are.
def unmark_keys hash
  hash.each_with_object({}) do |(key, val), cleaned|
    plain_key = key.is_a?(Hash) ? key[:value] : key
    cleaned[plain_key] = val.is_a?(Hash) ? unmark_keys(val) : val
  end
end

p unmark_keys result

#⇒ {"en"=>
#⇒   {"errors"=>
#⇒     {
#⇒       "format"=>{:value=>"%{attribute} %{message}", :line=>4},
#⇒       "messages"=>
#⇒          {
#⇒            "1"=>{:value=>"Message 1", :line=>8}, 
#⇒            "2"=>{:value=>"Message 2", :line=>9}
#⇒       }
#⇒     }, 
#⇒     "long_error_message"=>{
#⇒        :value=>"This is a\nmultiline message\n", :line=>11
#⇒     }, 
#⇒     "date"=>{"format"=>{:value=>"YYYY-MM-DD", :line=>16}}
#⇒   }
#⇒ }

当然有人可能想要摆脱全局变量等。我试图尽可能保持核心实现的清洁。

我们走了。希望它有所帮助。

UPD 感谢@matt,上面的代码在标量上失败了:

key1:
  val1

key2: val2

YAML允许使用此语法,但上述方法无法正确处理它,不会为这种情况返回任何行号。除了恼人地缺少对这类标量的支持之外,其他情况下的行号都能正确报告,更多细节请参阅此答案的评论。

答案 2 :(得分:2)

我采用了@matt的解决方案,并创建了一个不需要猴子补丁(monkey patching)的版本。它还能处理跨越多行的值以及YAML的<<运算符。

require "psych"
require "pp"

# Simple container pairing a converted scalar (`value`) with the source
# lines it was found on (`lines`).
ValueWithLineNumbers = Struct.new(:value, :lines)

# Scalar AST node that additionally remembers a line number.
class Psych::Nodes::ScalarWithLineNumber < Psych::Nodes::Scalar
  # The last positional argument is the line number; everything before
  # it is handed to Psych::Nodes::Scalar unchanged.
  def initialize(*args, line_number)
    @line_number = line_number
    super(*args)
  end

  # Line number given at construction time.
  attr_reader :line_number
end

# TreeBuilder variant that creates ScalarWithLineNumber nodes instead
# of plain scalar nodes.
class Psych::TreeWithLineNumbersBuilder < Psych::TreeBuilder
  # Must be assigned before parsing; used to read the current mark.
  attr_accessor :parser

  # Builds the scalar node with the parser's current mark line appended
  # to the event arguments, attaches it to the node currently being
  # built, and returns it.
  def scalar(*args)
    Psych::Nodes::ScalarWithLineNumber
      .new(*args, parser.mark.line)
      .tap { |node| @last.children << node }
  end
end

# ToRuby visitor that wraps scalar mapping values in
# ValueWithLineNumbers while still honouring the YAML << merge key.
class Psych::Visitors::ToRubyWithLineNumbers < Psych::Visitors::ToRuby
  # Line-numbered scalars are converted exactly like ordinary scalars.
  def visit_Psych_Nodes_ScalarWithLineNumber(node)
    visit_Psych_Nodes_Scalar(node)
  end

  private

  # Fills +hash+ from the key/value child pairs of mapping +node+ and
  # returns it.
  def revive_hash(hash, node)
    node.children.each_slice(2) do |key_node, value_node|
      key = accept(key_node)
      val = accept(value_node)

      if value_node.is_a? Psych::Nodes::ScalarWithLineNumber
        # Recorded line numbers get + 1 applied. The range runs from the
        # key's line (when the key carries one) to the value's line.
        last_line = value_node.line_number + 1
        first_line =
          if key_node.is_a? Psych::Nodes::ScalarWithLineNumber
            key_node.line_number + 1
          else
            last_line
          end
        val = ValueWithLineNumbers.new(val, first_line..last_line)
      end

      # Ordinary entry: anything that is not an untagged-as-string << key.
      unless key == SHOVEL && key_node.tag != "tag:yaml.org,2002:str"
        hash[key] = val
        next
      end

      # Merge key: fold the referenced mapping(s) into this hash. When
      # the merge source turns out not to be mergeable, fall back to
      # storing it under the << key itself.
      begin
        case value_node
        when Psych::Nodes::Alias, Psych::Nodes::Mapping
          hash.merge! val
        when Psych::Nodes::Sequence
          combined = {}
          val.reverse_each { |entry| combined.merge! entry }
          hash.merge! combined
        else
          hash[key] = val
        end
      rescue TypeError
        hash[key] = val
      end
    end

    hash
  end
end

# Usage: the builder and the parser reference each other, because the
# builder asks the parser for its mark while handling scalar events.
handler = Psych::TreeWithLineNumbersBuilder.new
parser = Psych::Parser.new(handler)
handler.parser = parser

parser.parse(yaml)

# Convert the finished tree with the line-number aware visitor.
visitor = Psych::Visitors::ToRubyWithLineNumbers.create
ruby_with_line_numbers = visitor.accept(handler.root)

pp ruby_with_line_numbers

我把上述代码发布成了一个gist,并附上了一些注释和示例。

答案 3 :(得分:0)

我们可以手动添加数字,方法是通过Psych提供的解析散列进行递归并找到每个键的行号。以下代码将与您指定的结果匹配。

require 'psych'

# Annotates every leaf value of +hash+ (the structure Psych.load returned)
# with the line on which its key appears in the YAML source.
#
# Each non-Hash value is replaced in place by
# <tt>{ "value" => original, "line" => n }</tt>, where +n+ is 1-based, or
# +nil+ when no matching key line is found. Nested hashes are processed
# recursively.
#
# Keys are looked up in hash order, and each search starts after the line
# matched for the previous key. A key name that occurs in several
# sections (for example "format" under both "errors" and "date")
# therefore gets its own line instead of the first occurrence in the file.
#
# The lookup is purely textual. It assumes block-style YAML with one key
# per line, and text inside a multi-line scalar that looks like "key:"
# can still be mistaken for a key.
#
# @param lines [Array<String>] the YAML source, one element per line
# @param hash [Hash] parsed data; modified in place
# @return [Hash] the same +hash+ object
def add_line_numbers(lines, hash)
  annotate_lines_from(lines, hash, 0)
  hash
end

# Annotates +hash+ searching +lines+ from index +cursor+ onwards and
# returns the index at which the next search should start.
def annotate_lines_from(lines, hash, cursor)
  # Work from a snapshot of the keys so entries can be reassigned safely.
  hash.keys.each do |key|
    value = hash[key]
    # The key text is escaped because it may contain regexp
    # metacharacters. It may be quoted in the source and may sit at any
    # indentation, including none (top-level keys).
    pattern = /\A\s*["']?#{Regexp.escape(key.to_s)}["']?\s*:(?:\s|\z)/
    index = (cursor...lines.size).find { |i| lines[i] =~ pattern }
    cursor = index + 1 if index

    if value.is_a?(Hash)
      cursor = annotate_lines_from(lines, value, cursor)
    else
      hash[key] = { "value" => value, "line" => (index && index + 1) }
    end
  end
  cursor
end

# Read the file once; the same lines feed both the YAML parser and the
# line-number lookup.
yaml_file = File.expand_path('../foo.yml', __FILE__)
lines = File.readlines(yaml_file)
# File.readlines keeps each line's terminator, so the lines are joined
# without a separator. Joining with "\n" would double every line break
# and thereby insert blank lines into block scalars.
data = Psych.load(lines.join)
add_line_numbers(lines, data)
puts data