我是scala的新手,希望使用RegexParsers类提取一些重要功能
abstract class LogLine extends java.io.Serializable {
def app: String
}
case class AppSummary(timestamp: String, app: String, name: String, user: String, state:String, url:String, host: String, startTime: String, endTime: String, finalStatus: String) extends LogLine
case class OperSum(title: String, user: String, operation:String,target:String,result:String, app: String, container: String) extends LogLine
case object UnknownLine extends LogLine {
val app = "unknown"
}
object LogP extends RegexParsers with java.io.Serializable {
def logline: Parser[LogLine] = (
timestamp~"INFO org.apache.hadoop.yarn.server.resourcemanager.RMAppManager$ApplicationSummary: appId="~ident
~",name="~identW
~",user="~ident
~",queue=default,state="~ident
~",trackingUrl="~url
~",appMasterHost="~ident
~".icdatacluster2,startTime="~ident
~",finishTime="~ident
~",finalStatus="~ident ^^ {
case t~_~app~_~name~_~user~_~state~_~url~_~host~_~stime~_~etime~_~finalStatus =>
AppSummary(t, app, name, user, state, url, host, stime, etime, finalStatus)
}
| timestamp~"INFO org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger: USER="~identY
~"OPERATION="~identY
~"TARGET="~identY
~"RESULT="~identY
~"APPID="~identY
~"CONTAINERID="~ident ^^ {
case t~_~user~_~operation~_~target~_~result~_~app~_~container =>
OperSum(t, user, operation, target, result, app, container)
}
)
val ident: Parser[String] = "[A-Za-z0-9_]+".r
val identY: Parser[String] ="[A-Za-z0-9_]+\\s".r
val identW: Parser[String] = "[A-Za-z0-9_ ]+".r
val timestamp: Parser[String] = "2015-[0-9][0-9]-[0-9][0-9] [0-9:,]+".r
val url: Parser[String] = "http://[a-zA-Z0-1.]+:[0-9]+/[a-zA-Z0-9_/]+".r
}
它可以在第一种情况下起作用,但不能在第二种情况下起作用,例如:以下单词: 2015-03-09 01:36:39,016 INFO org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger:USER = pwalch OPERATION = AM Released Container TARGET = SchedulerApp RESULT = SUCCESS APPID = application_1425682538854_0741 CONTAINERID = container_1425682538854_0741_01_000004 无法提取。希望某人可以帮忙
答案 0 :(得分:0)
一些可能的原因:
timestamp
和以INFO
开头的字符串之间的空格?identY
正则表达式"[A-Za-z0-9_]+\\s".r
只会匹配 AM - 所以后续匹配器都将失败