我是Scala和Scalding的新手,在处理我的第一个Job时,我在向val分配管道时遇到NullPointerException。在没有中间变量的情况下链接到.write()
的完全相同的作业按预期完成。
StackOverflow_Works
和StackOverflow_NPE
工作不相同吗?仍然遇到问题的缩减示例:
import com.twitter.scalding._
case class Line(timestamp:Float, sessionId:String)
class UserSession () {
var sessionId = "__default_session_id__"
var firstTimestamp = 0.0f
var lastTimestamp = 0.0f
override def toString() = {
("UserSession(" +
"sessionId:%s, " +
"firstTimestamp:%f, " +
"lastTimestamp:%f" +
")").format(
sessionId,
firstTimestamp,
lastTimestamp
)
}
}
object Predicates {
def parseLogLine(line:String): Line = {
val split_line = line.split("\\t")
new Line(
timestamp = split_line(0).toFloat,
sessionId = split_line(1)
)
}
def foldLeftSessions(sesh:UserSession, line:Line): UserSession = {
sesh.sessionId = line.sessionId
if (sesh.firstTimestamp == 0.0f) {
sesh.firstTimestamp = line.timestamp
}
sesh.lastTimestamp = line.timestamp
sesh
}
}
class StackOverflow_NPE(args:Args) extends Job(args) {
import Predicates._
val inputFile = args("input")
val outputFile = args("output")
val beaconLog:TypedPipe[String] = TypedPipe.from(TextLine(inputFile))
val sessions = beaconLog
.map(parseLogLine)
.groupBy { b:Line => b.sessionId }
.sortBy { b:Line => b.timestamp }
.foldLeft(new UserSession)(foldLeftSessions)
.values
sessions.write(TypedTsv(outputFile))
}
class StackOverflow_Works(args:Args) extends Job(args) {
import Predicates._
val inputFile = args("input")
val outputFile = args("output")
val beaconLog:TypedPipe[String] = TypedPipe.from(TextLine(inputFile))
beaconLog
.map(parseLogLine)
.groupBy { b:Line => b.sessionId }
.sortBy { b:Line => b.timestamp }
.foldLeft(new UserSession)(foldLeftSessions)
.values
.write(TypedTsv(outputFile))
}
输出作业StackOverflow_Works
:
(成功运行)
objc[49967]: Class JavaLaunchHelper is implemented in both /Library/Java/JavaVirtualMachines/jdk1.8.0_45.jdk/Contents/Home/bin/java and /Library/Java/JavaVirtualMachines/jdk1.8.0_45.jdk/Contents/Home/jre/lib/libinstrument.dylib. One of the two will be used. Which one is undefined.
17/06/01 15:41:25 WARN scalding.LineNumber$: Skipping com.twitter.scalding.typed.ReduceStep$$Lambda$89/1583353301.apply(Unknown Source).getClassName as we can't find the class
17/06/01 15:41:25 WARN scalding.LineNumber$: Skipping com.twitter.scalding.typed.ReduceStep$$Lambda$81/252553541.apply(Unknown Source).getClassName as we can't find the class
17/06/01 15:41:25 WARN scalding.LineNumber$: Skipping com.twitter.scalding.typed.TypedPipeFactory$$$Lambda$62/1212116343.apply(Unknown Source).getClassName as we can't find the class
17/06/01 15:41:25 WARN scalding.LineNumber$: Skipping com.twitter.scalding.typed.TypedPipe$$Lambda$87/891786282.apply(Unknown Source).getClassName as we can't find the class
17/06/01 15:41:25 WARN scalding.LineNumber$: Skipping com.twitter.scalding.typed.TypedPipeFactory$$$Lambda$62/1212116343.apply(Unknown Source).getClassName as we can't find the class
17/06/01 15:41:25 WARN scalding.LineNumber$: Skipping com.twitter.scalding.typed.TypedPipe$$Lambda$87/891786282.apply(Unknown Source).getClassName as we can't find the class
17/06/01 15:41:25 WARN scalding.LineNumber$: Skipping com.twitter.scalding.typed.TypedPipeFactory$$$Lambda$62/1212116343.apply(Unknown Source).getClassName as we can't find the class
17/06/01 15:41:25 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
17/06/01 15:41:26 INFO util.HadoopUtil: resolving application jar from found main method on: com.twitter.scalding.Tool$
17/06/01 15:41:26 INFO planner.HadoopPlanner: using application jar: /___/.ivy2/cache/com.twitter/scalding-core_2.12/jars/scalding-core_2.12-0.17.0.jar
17/06/01 15:41:26 INFO property.AppProps: using app.id: 1B171A71FA8C4402B6BE83737977F803
17/06/01 15:41:26 INFO util.Version: Concurrent, Inc - Cascading 2.6.1
17/06/01 15:41:26 INFO flow.Flow: [StackOverflow_Works] starting
17/06/01 15:41:26 INFO flow.Flow: [StackOverflow_Works] source: Hfs["TextLine[['offset', 'line']->[ALL]]"]["../test-data/stackoverflow.log"]
17/06/01 15:41:26 INFO flow.Flow: [StackOverflow_Works] sink: Hfs["TextDelimited[[0]]"]["../test-data/stackoverflow-output"]
17/06/01 15:41:26 INFO flow.Flow: [StackOverflow_Works] parallel execution is enabled: false
17/06/01 15:41:26 INFO flow.Flow: [StackOverflow_Works] starting jobs: 1
17/06/01 15:41:26 INFO flow.Flow: [StackOverflow_Works] allocating threads: 1
17/06/01 15:41:26 INFO flow.FlowStep: [StackOverflow_Works] starting step: (1/1) ...data/stackoverflow-output
17/06/01 15:41:26 INFO jvm.JvmMetrics: Initializing JVM Metrics with processName=JobTracker, sessionId=
17/06/01 15:41:26 INFO mapred.FileInputFormat: Total input paths to process : 1
17/06/01 15:41:27 INFO mapred.LocalJobRunner: OutputCommitter set in config null
17/06/01 15:41:27 INFO flow.FlowStep: [StackOverflow_Works] submitted hadoop job: job_local918733190_0001
17/06/01 15:41:27 INFO flow.FlowStep: [StackOverflow_Works] tracking url: http://localhost:8080/
17/06/01 15:41:27 INFO mapred.LocalJobRunner: OutputCommitter is org.apache.hadoop.mapred.FileOutputCommitter
17/06/01 15:41:27 INFO mapred.LocalJobRunner: Waiting for map tasks
17/06/01 15:41:27 INFO mapred.LocalJobRunner: Starting task: attempt_local918733190_0001_m_000000_0
17/06/01 15:41:27 WARN mapreduce.Counters: Group org.apache.hadoop.mapred.Task$Counter is deprecated. Use org.apache.hadoop.mapreduce.TaskCounter instead
17/06/01 15:41:27 INFO mapred.Task: Using ResourceCalculatorPlugin : null
17/06/01 15:41:27 INFO io.MultiInputSplit: current split input path: file:/___/test-data/stackoverflow.log
17/06/01 15:41:27 INFO mapred.MapTask: Processing split: cascading.tap.hadoop.io.MultiInputSplit@401c79fd
17/06/01 15:41:27 WARN mapreduce.Counters: Counter name MAP_INPUT_BYTES is deprecated. Use FileInputFormatCounters as group name and BYTES_READ as counter name instead
17/06/01 15:41:27 INFO mapred.MapTask: numReduceTasks: 1
17/06/01 15:41:27 INFO mapred.MapTask: Map output collector class = org.apache.hadoop.mapred.MapTask$MapOutputBuffer
17/06/01 15:41:27 INFO mapred.MapTask: io.sort.mb = 100
17/06/01 15:41:27 INFO mapred.MapTask: data buffer = 79691776/99614720
17/06/01 15:41:27 INFO mapred.MapTask: record buffer = 262144/327680
17/06/01 15:41:27 INFO hadoop.FlowMapper: cascading version: 2.6.1
17/06/01 15:41:27 INFO hadoop.FlowMapper: child jvm opts: -Xmx200m
17/06/01 15:41:27 INFO hadoop.FlowMapper: sourcing from: Hfs["TextLine[['offset', 'line']->[ALL]]"]["../test-data/stackoverflow.log"]
17/06/01 15:41:27 INFO hadoop.FlowMapper: sinking to: GroupBy(com.twitter.scalding.TextLine(../test-data/stackoverflow.log)0891c647-0d6d-4c57-92a4-b53608a246e6)[by:[{1}:'key']]
17/06/01 15:41:27 INFO mapred.MapTask: Starting flush of map output
17/06/01 15:41:27 INFO mapred.MapTask: Finished spill 0
17/06/01 15:41:27 INFO mapred.Task: Task:attempt_local918733190_0001_m_000000_0 is done. And is in the process of commiting
17/06/01 15:41:27 INFO mapred.LocalJobRunner: file:/___/test-data/stackoverflow.log:0+20350
17/06/01 15:41:27 INFO mapred.Task: Task 'attempt_local918733190_0001_m_000000_0' done.
17/06/01 15:41:27 INFO mapred.LocalJobRunner: Finishing task: attempt_local918733190_0001_m_000000_0
17/06/01 15:41:27 INFO mapred.LocalJobRunner: Map task executor complete.
17/06/01 15:41:27 WARN mapreduce.Counters: Group org.apache.hadoop.mapred.Task$Counter is deprecated. Use org.apache.hadoop.mapreduce.TaskCounter instead
17/06/01 15:41:27 INFO mapred.Task: Using ResourceCalculatorPlugin : null
17/06/01 15:41:27 INFO mapred.LocalJobRunner:
17/06/01 15:41:27 INFO mapred.Merger: Merging 1 sorted segments
17/06/01 15:41:27 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 56102 bytes
17/06/01 15:41:27 INFO mapred.LocalJobRunner:
17/06/01 15:41:27 INFO hadoop.FlowReducer: cascading version: 2.6.1
17/06/01 15:41:27 INFO hadoop.FlowReducer: child jvm opts: -Xmx200m
17/06/01 15:41:27 INFO hadoop.FlowReducer: sourcing from: GroupBy(com.twitter.scalding.TextLine(../test-data/stackoverflow.log)0891c647-0d6d-4c57-92a4-b53608a246e6)[by:[{1}:'key']]
17/06/01 15:41:27 INFO hadoop.FlowReducer: sinking to: Hfs["TextDelimited[[0]]"]["../test-data/stackoverflow-output"]
17/06/01 15:41:27 INFO mapred.Task: Task:attempt_local918733190_0001_r_000000_0 is done. And is in the process of commiting
17/06/01 15:41:27 INFO mapred.LocalJobRunner:
17/06/01 15:41:27 INFO mapred.Task: Task attempt_local918733190_0001_r_000000_0 is allowed to commit now
17/06/01 15:41:27 INFO mapred.FileOutputCommitter: Saved output of task 'attempt_local918733190_0001_r_000000_0' to file:/___/test-data/stackoverflow-output
17/06/01 15:41:27 INFO mapred.LocalJobRunner: reduce > reduce
17/06/01 15:41:27 INFO mapred.Task: Task 'attempt_local918733190_0001_r_000000_0' done.
17/06/01 15:41:32 INFO util.Hadoop18TapUtil: deleting temp path ../test-data/stackoverflow-output/_temporary
Process finished with exit code 0
输出作业StackOverflow_NPE
:
(爆炸)
objc[51212]: Class JavaLaunchHelper is implemented in both /Library/Java/JavaVirtualMachines/jdk1.8.0_45.jdk/Contents/Home/bin/java and /Library/Java/JavaVirtualMachines/jdk1.8.0_45.jdk/Contents/Home/jre/lib/libinstrument.dylib. One of the two will be used. Which one is undefined.
17/06/01 16:05:15 WARN scalding.LineNumber$: Skipping com.twitter.scalding.typed.ReduceStep$$Lambda$89/1583353301.apply(Unknown Source).getClassName as we can't find the class
17/06/01 16:05:15 WARN scalding.LineNumber$: Skipping com.twitter.scalding.typed.ReduceStep$$Lambda$81/252553541.apply(Unknown Source).getClassName as we can't find the class
17/06/01 16:05:15 WARN scalding.LineNumber$: Skipping com.twitter.scalding.typed.TypedPipeFactory$$$Lambda$62/1212116343.apply(Unknown Source).getClassName as we can't find the class
17/06/01 16:05:15 WARN scalding.LineNumber$: Skipping com.twitter.scalding.typed.TypedPipe$$Lambda$87/891786282.apply(Unknown Source).getClassName as we can't find the class
17/06/01 16:05:15 WARN scalding.LineNumber$: Skipping com.twitter.scalding.typed.TypedPipeFactory$$$Lambda$62/1212116343.apply(Unknown Source).getClassName as we can't find the class
17/06/01 16:05:15 WARN scalding.LineNumber$: Skipping com.twitter.scalding.typed.TypedPipe$$Lambda$87/891786282.apply(Unknown Source).getClassName as we can't find the class
17/06/01 16:05:15 WARN scalding.LineNumber$: Skipping com.twitter.scalding.typed.TypedPipeFactory$$$Lambda$62/1212116343.apply(Unknown Source).getClassName as we can't find the class
17/06/01 16:05:15 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Exception in thread "main" java.lang.Throwable: If you know what exactly caused this error, please consider contributing to GitHub via following link.
https://github.com/twitter/scalding/wiki/Common-Exceptions-and-possible-reasons#javalangnullpointerexception
at com.twitter.scalding.Tool$.main(Tool.scala:152)
at com.twitter.scalding.Tool.main(Tool.scala)
Caused by: java.lang.NullPointerException
at com.twitter.scalding.ReferencedClassFinder$.$anonfun$findReferencedClasses$5(ReferencedClassFinder.scala:48)
at com.twitter.scalding.ReferencedClassFinder$.$anonfun$findReferencedClasses$5$adapted(ReferencedClassFinder.scala:43)
at com.twitter.scalding.ReferencedClassFinder$$$Lambda$352/1011104118.apply(Unknown Source)
at scala.collection.TraversableLike$WithFilter.$anonfun$map$2(TraversableLike.scala:739)
at scala.collection.TraversableLike$WithFilter$$Lambda$346/1273879638.apply(Unknown Source)
at scala.collection.immutable.List.foreach(List.scala:389)
at scala.collection.TraversableLike$WithFilter.map(TraversableLike.scala:738)
at com.twitter.scalding.ReferencedClassFinder$.$anonfun$findReferencedClasses$4(ReferencedClassFinder.scala:43)
at com.twitter.scalding.ReferencedClassFinder$$$Lambda$349/1613514326.apply(Unknown Source)
at scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:241)
at scala.collection.TraversableLike$$Lambda$104/95396809.apply(Unknown Source)
at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:32)
at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:29)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:191)
at scala.collection.TraversableLike.flatMap(TraversableLike.scala:241)
at scala.collection.TraversableLike.flatMap$(TraversableLike.scala:238)
at scala.collection.mutable.ArrayOps$ofRef.flatMap(ArrayOps.scala:191)
at com.twitter.scalding.ReferencedClassFinder$.findReferencedClasses(ReferencedClassFinder.scala:40)
at com.twitter.scalding.Job.reflectedClasses(Job.scala:203)
at com.twitter.scalding.Job.config(Job.scala:191)
at com.twitter.scalding.Job.executionContext(Job.scala:223)
at com.twitter.scalding.Job.buildFlow(Job.scala:231)
at com.twitter.scalding.Job.run(Job.scala:302)
at com.twitter.scalding.Tool.start$1(Tool.scala:124)
at com.twitter.scalding.Tool.run(Tool.scala:140)
at com.twitter.scalding.Tool.run(Tool.scala:68)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
at com.twitter.scalding.Tool$.main(Tool.scala:148)
... 1 more
Process finished with exit code 1
东西的版本:
另请注意,我只是使用com.twitter.scalding.Tool
在IntelliJ中本地运行它们。