使用 Logstash 时,如何把每行开头时间戳相同的多行合并为一条事件?

时间:2014-05-15 02:03:09

标签: marklogic logstash

MarkLogic 日志文件用多行带有完全相同时间戳(精确到毫秒)的记录来表示一条多行消息。您可以在下面的日志片段中看到:

2014-05-09 18:47:33.844 Info: Dev_Xdbc: [/file-store/get-file.xqy] url=/2.16.840.1.113883.3.21/d3c787ef-2244-48e4-a8a9-82ae5e7ad47a-296 tenant=perform001
2014-05-09 18:47:34.101 Notice: Dev_Xdbc: XDMP-MODNOTFOUND: (err:XQST0059) Module /2.6.0/utils/no-fn.xqy not found
2014-05-09 18:47:34.101 Notice: Dev_Xdbc:  [1.0-ml]

所以这里以2014-05-09 18:47:34.101开头的两行是同一条日志消息的一部分。

我不知道如何让 Logstash 把以相同字符串开头的行合并在一起。multiline 编解码器/过滤器似乎不支持以这种方式合并行。有没有办法做到这一点?

1 个答案:

答案 0 :(得分:3)

这似乎是一个有趣的问题,所以我基于 multiline 插件为它编写了一个插件,不过情况有些复杂:Logstash 核心代码中存在一个 bug,必须先修复它,这个插件才能正常工作。

将以下内容保存到 lib/filters/related.rb:

# encoding: utf-8
require "logstash/filters/base"
require "logstash/namespace"
require "set"
#
# This filter collapses multiline messages from a single source into one
# Logstash event when they are related according to the configured pattern.
# Two events are related if the pattern match returns the same captured value
# for consecutive events.
# 
# The config looks like this:
#
#     filter {
#       related {
#         type => "type"
#         pattern => "^%{TIMESTAMP:time}"
#         capture => "TIMESTAMP:time"
#       }
#     }
# 
# The `pattern` is the pattern that is used to match the lines
# The `capture` is the named capture that has to match between the lines
#
class LogStash::Filters::Related < LogStash::Filters::Base

  config_name "related"
  milestone 1

  # The grok pattern to match. It is compiled once in `register` and applied
  # to each event's message (to the first element when the message is an
  # array).
  config :pattern, :validate => :string, :required => true

  # The key looked up in the grok match's captures (for example
  # "TIMESTAMP:time"). Consecutive events of one stream whose captured value
  # under this key is equal are merged into a single event.
  config :capture, :validate => :string, :required => true

  # The stream identity is how this filter determines which stream an
  # event belongs to. This is generally used for differentiating, say, events
  # coming from multiple files in the same file input, or multiple connections
  # coming from a tcp input.
  #
  # The default value here is usually what you want, but there are some cases
  # where you want to change it. One such example is if you are using a tcp
  # input with only one client connecting at any time. If that client
  # reconnects (due to error or client restart), then logstash will identify
  # the new connection as a new stream and break any multiline goodness that
  # may have occurred between the old and new connection. To solve this use
  # case, you can use "%{@source_host}.%{@type}" instead.
  config :stream_identity , :validate => :string, :default => "%{host}.%{path}.%{type}"

  # Logstash ships by default with a bunch of patterns, so you don't
  # necessarily need to define this yourself unless you are adding additional
  # patterns.
  #
  # Pattern files are plain text with format:
  #
  #     NAME PATTERN
  #
  # For example:
  #
  #     NUMBER \d+
  config :patterns_dir, :validate => :array, :default => []

  # Detect if we are running from a jarfile, pick the right path.
  @@patterns_path = Set.new
  if __FILE__ =~ /file:\/.*\.jar!.*/
    @@patterns_path += ["#{File.dirname(__FILE__)}/../../patterns/*"]
  else
    @@patterns_path += ["#{File.dirname(__FILE__)}/../../../patterns/*"]
  end

  # Sets up the per-stream state used by `filter`. Every hash below is keyed
  # by the stream identity string computed in `filter`.
  public
  def initialize(config = {})
    super

    @threadsafe = false

    # This filter needs to keep state.
    # NOTE(review): @types is not read anywhere in this class; kept only so
    # that the object's state is unchanged.
    @types = Hash.new { |h,k| h[k] = [] }
    # A non-matching event that arrived while a group was pending; it is
    # released on the next call to `filter` for the same stream.
    @pending_unmatched = Hash.new
    # The event currently accumulating related lines for each stream.
    @pending = Hash.new
    # The captured value of the pending group, compared against the next
    # event's captured value.
    @previous = Hash.new
  end # def initialize

  # Loads the grok pattern files (built-in path plus `patterns_dir`) and
  # compiles the configured `pattern`.
  public
  def register
    require "grok-pure" # rubygem 'jls-grok'

    @grok = Grok.new

    @patterns_dir = @@patterns_path.to_a + @patterns_dir
    @patterns_dir.each do |path|
      # Can't read relative paths from jars, try to normalize away '../'
      while path =~ /file:\/.*\.jar!.*\/\.\.\//
        # replace /foo/bar/../baz => /foo/baz
        path = path.gsub(/[^\/]+\/\.\.\//, "")
      end

      if File.directory?(path)
        path = File.join(path, "*")
      end

      Dir.glob(path).each do |file|
        @logger.info("Grok loading patterns from file", :path => file)
        @grok.add_patterns_from_file(file)
      end
    end

    @grok.compile(@pattern)

    # Name this plugin (not the multiline filter it was derived from) in the log.
    @logger.debug("Registered related plugin", :type => @type, :config => @config)
  end # def register

  # Merges consecutive events of one stream whose `capture` value is equal.
  #
  # Matching events are held back (cancelled) while their captured value stays
  # the same. When an event with a different captured value, or a non-matching
  # event, arrives, the held group is written over the current event and
  # emitted with its message joined by newlines.
  #
  # The block is called with a clone of the current event when a previously
  # stashed non-matching event has to be released first.
  public
  def filter(event)
    key = event.sprintf(@stream_identity)

    ## if there's anything pending unmatched, we need to push it out
    ## and then push the event back on the filter queue (yield)
    if @pending_unmatched[key]
      @logger.info("Related", :unmatched => key)
      clone = event.clone
      event.overwrite(@pending_unmatched[key])
      @pending_unmatched.delete(key)
      yield clone
      return
    end
    return unless filter?(event)

    # Only the first line of an (already merged) message is matched.
    message = event["message"]
    first_line = message.is_a?(Array) ? message.first : message
    match = @grok.match(first_line)
    pending = @pending[key]

    @logger.debug("Related", :pattern => @pattern, :message => message,
                  :match => match, :capture => @capture)

    if !match
      if pending
        # Emit the pending group now in place of this event; the unmatched
        # event itself is stashed and released on the next call.
        @pending_unmatched[key] = event.clone
        event.overwrite(pending)
        @pending.delete(key)
        collapse_event!(event)
        filter_matched(event)
      end
      return
    end

    ## from here out, we've matched
    if pending
      if match.captures[@capture] == @previous[key]
        # Same captured value: fold this event into the pending group.
        pending.append(event)
        pending.tag "related"
        event.cancel
      else
        # Different captured value: this event starts a new group and the
        # previous group is emitted in its place.
        @pending[key] = event.clone
        @previous[key] = match.captures[@capture]
        event.overwrite(pending)
        collapse_event!(event)
        filter_matched(event)
      end
    else
      # First matching event for this stream: hold it back.
      @pending[key] = event
      @previous[key] = match.captures[@capture]
      event.cancel
    end
  end # def filter

  # Flush any pending messages. This is generally used for unit testing only.
  #
  # Note: flush is disabled now; it is preferable to use the multiline codec.
  # NOTE(review): events stashed in @pending_unmatched are not returned here,
  # and returned events are not passed through collapse_event! - confirm that
  # this is intended before relying on this method.
  public
  def __flush
    events = []
    @pending.each do |key, value|
      value.uncancel
      events << value
    end
    @pending.clear
    return events
  end # def flush

  # Joins an array-valued "message" with newlines and reduces an array-valued
  # "@timestamp" to its first element. Returns the same event.
  def collapse_event!(event)
    event["message"] = event["message"].join("\n") if event["message"].is_a?(Array)
    event["@timestamp"] = event["@timestamp"].first if event["@timestamp"].is_a?(Array)
    event
  end
end # class LogStash::Filters::Related

然后修改 lib/event.rb,在 overwrite 方法中添加注释所标出的那一行:

public
# Replaces this event's data with the data of `event`.
#
# A String timestamp is converted with LogStash::Time.parse_iso8601, and the
# accessors are rebuilt on top of the new data hash.
def overwrite(event)
  @data = event.to_hash

  # Convert the timestamp if it arrived as a String.
  stamp = @data[TIMESTAMP]
  @data[TIMESTAMP] = LogStash::Time.parse_iso8601(stamp) if stamp.is_a?(String)

  # Add the line below. Without it, after data["message"] has been
  # overwritten with an array, reads through the accessors would still see
  # the message from before the overwrite.
  @accessors = LogStash::Util::Accessors.new(@data)
end