Ruby:正则表达式在新行字符上出错

时间:2013-12-31 04:37:05

标签: ruby regex

我有以下课程:

require 'strscan'

class ConfParser

  class Error < StandardError; end
  VERSION = '0.0.1'
  SECTION_REGEX = /^\[       # Opening bracket
                   ([^\]]+)  # Section name
                   \]$       # Closing bracket
                 /x

  attr_accessor :filename, :config

  def initialize(opts = {})
    @file = opts.fetch(:filename)
    @separator = opts.fetch(:separator, ":")
    @content = nil
    @config = Hash.new { |h,k| h[k] = Hash.new }

    load
  end

  def load
    raise_error("First line of config file contain be blank") if first_line_empty?

    return unless File.file?(@file)

    f = File.open(@file, 'r')
    @content = f.read
    parse!

    self

    ensure
      f.close if f && !f.closed?
  end

  def sections
    @config.keys
  end

  def [](section)
    return nil if section.nil?

    @config[section.to_s]
  end

  def []=( section, value )
    @config[section.to_s] = value
  end

  def write
    File.open(@file, 'w') do |f|
      @config.each do |section, hash|
        f.puts "[#{section}]"
        hash.each { |param, val| f.puts "#{param}: #{escape_value val}" }
        f.puts
      end
    end

    self
  end

  private

    def parse!
      @_section = nil
      @_current_line = nil
      property = ''
      string = ''

      scanner = StringScanner.new(@content)

      # Keep going until EndOfString
      until scanner.eos?
        # Let's keep up with the current line for debugging purposes
        @_current_line = scanner.check(%r/\A.*$/) if scanner.bol?

        # If a new section? Write it to memory
        if scanner.scan(SECTION_REGEX)
          section = scanner[1].strip
          @_section = @config[section] unless section_exists?(section)

        # if a quoted string, process
        elsif scanner.scan(%r/"/)
          quote = scanner.scan_until(/(?:\A|[^\\])"/)
          raise_error('Unmatched quote') if quote.nil?

          quote.chomp!('"')
          string << quote

        # If end of lines, empty strings, skip
        elsif scanner.skip(%r/\A\s*?$/)
          string.chomp << scanner.getch unless scanner.eos?
          process_property(property, string)

        # If a key:value pair, lets write it to memory
        elsif scanner.scan(%r/#{@separator}/im)
          if property.empty?
            property = string.strip
            string.slice!(0, string.length)
          else
            raise_error
          end
        else
          tmp = scanner.scan_until(%r/([\n"#{@separator}] | \z | \\[\[\]#{@separator}"])/mx)
          raise_error if tmp.nil?

          len = scanner[1].length
          tmp.slice!(tmp.length - len, len)

          scanner.pos = scanner.pos - len
          string << tmp
        end
      end

      process_property(property, string)
    end

    def process_property(section, value)
      value.chomp!
      return if section.empty? and value.empty?
      return if value.sub!(%r/\\\s*\z/, '')

      # all whitespace be gone!
      section.strip!
      value.strip!

      # if there is no section to set, we need to raise an error, Raise a glass? I kid, I kid
      raise_error if section.empty?

      # set the section property value with an unescaped version of the value
      current_section[section.dup] = value.dup unless section_exists?(section)

      section.slice!(0, section.length)
      value.slice!(0, value.length)
      nil
    end

    def first_line_empty?
      File.readlines(@file).first.chomp.empty?
    end

    def raise_error(msg = 'Error processing line')
      raise Error, "#{msg}: #{@_current_line}"
    end

    def current_section
      @_section ||= @config['header']
    end

    def section_exists?(key)
      @config.has_key? key
    end

    def unescape_value(value)
      value = value.to_s
      value.gsub!(%r/\\[0nrt\\]/) { |char|
        case char
        when '\0';   "\0"
        when '\n';   "\n"
        when '\r';   "\r"
        when '\t';   "\t"
        when '\\\\'; "\\"
        end
      }
      typecast value
    end

    def typecast(value)
      case value
        when /^\s*$/                                        then nil
        when /^-?(?:\d|[1-9]\d+)$/                          then Integer(value)
        when /^-?(?:\d|[1-9]\d+)(?:\.\d+)?(?:e[+-]?\d+)?$/i then Float(value)
        when /true/i                                        then true
        when /false/i                                       then false
        else                                                     value
      end
    end

    def escape_value(value)
      value = value.to_s.dup
      value.gsub!(%r/\\([0nrt])/, '\\\\\1')
      value.gsub!(%r/\n/, '\n')
      value.gsub!(%r/\r/, '\r')
      value.gsub!(%r/\t/, '\t')
      value.gsub!(%r/\0/, '\0')

      typecast value
    end

end

它用于解析看起来像这样的文件:

[header]
project: Programming Test
budget : 4.5
accessed :205

[meta data]
description : This is a tediously long description of the Hello World
  project. Tedious isn't the right word, but
  it's the first word that comes to mind.

correction text: I meant 'moderately,' not 'tediously,' above.

[ trailer ]
budget:all out of budget.

当解析器运行并进入“元数据”部分时,它会处理描述的键,但似乎无法处理多个行值,我无法弄清楚原因。它不断尝试将新行作为键处理而不是部分值。

我的正则表达式技巧不是最好的,但这应该有效...

1 个答案:

答案 0 :(得分:0)

您可能需要通过在正则表达式模式的右侧添加m来启用多行模式。

文档:http://www.ruby-doc.org/core-2.1.0/Regexp.html

编辑 - 顺便说一句,使用解析器库可能会更容易,这使得编写此类事物变得更加简单。我真的很喜欢Parslet(http://kschiess.github.io/parslet/get-started.html)但Treetop(http://treetop.rubyforge.org/)也很受欢迎。