I am running a logistic regression on my dataset and everything seems to work, but when I try to print the confusion matrix I get an error that I don't know how to handle.
import org.apache.spark.mllib.evaluation.MulticlassMetrics

// Convert the prediction/label columns to an RDD[(Double, Double)] for the RDD-based metrics API
val predictionAndLabels = results.select($"prediction", $"label").as[(Double, Double)].rdd
val metrics = new MulticlassMetrics(predictionAndLabels)
println("Confusion matrix:")
println(metrics.confusionMatrix)
Log output:
root
|-- ORDER_QUANTITY: integer (nullable = true)
|-- IS_BOUGHT: integer (nullable = true)
|-- CUSTOMER_ID: long (nullable = true)
|-- SNAPSHOT_DAY: string (nullable = true)
|-- WEEK_DAY: string (nullable = true)
|-- DEVICE_TYPE: string (nullable = true)
|-- HIT_TIME: string (nullable = true)
|-- MARKETPLACE: string (nullable = true)
|-- ASIN: string (nullable = true)
|-- VEL: string (nullable = true)
|-- GL_PRODUCT_GROUP: string (nullable = true)
|-- IS_FT: string (nullable = true)
|-- INSTOCK_STATUS: string (nullable = true)
|-- PDD_WD: string (nullable = true)
|-- FT_DAYS: string (nullable = true)
|-- IN_STOCK_QUANTITY: string (nullable = true)
|-- ASIN_PRICE: double (nullable = true)
|-- PRICE_GAP: double (nullable = true)
|-- IS_DISCOUNT: string (nullable = true)
logRegDataAll: org.apache.spark.sql.DataFrame = [label: int, DEVICE_TYPE: string ... 10 more fields]
logRegData: org.apache.spark.sql.DataFrame = [label: int, DEVICE_TYPE: string ... 10 more fields]
import org.apache.spark.ml.feature.{VectorAssembler, StringIndexer, VectorIndexer, OneHotEncoder}
import org.apache.spark.ml.linalg.Vectors
DeviceTypeIndexer: org.apache.spark.ml.feature.StringIndexer = strIdx_703e2f28bf96
MarketplaceIndexer: org.apache.spark.ml.feature.StringIndexer = strIdx_4bd47e3e31c5
VelocityIndexer: org.apache.spark.ml.feature.StringIndexer = strIdx_744315e59c01
GLIndexer: org.apache.spark.ml.feature.StringIndexer = strIdx_30a9705e2305
FTIndexer: org.apache.spark.ml.feature.StringIndexer = strIdx_fb2e7ec8b38c
InStockIndexer: org.apache.spark.ml.feature.StringIndexer = strIdx_15ceee49c6a9
PDDIndexer: org.apache.spark.ml.feature.StringIndexer = strIdx_a3987fcecd10
InStockQtyIndexer: org.apache.spark.ml.feature.StringIndexer = strIdx_9c0bc369a617
IsDicountIndexer: org.apache.spark.ml.feature.StringIndexer = strIdx_cf2902b30b63
DeviceTypeEncoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_5560566be7cb
MarketplaceEncoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_d2c6ca94f073
VelocityEncoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_0f1f237e9700
GLEncoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_70baf14c780a
FTEncoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_bb3312ac9c1e
InStockEncoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_f273d6b316b6
PDDEncoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_a663d8560283
InStockQtyEncoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_8300bb250ef0
IsDiscountEncoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_f5eed05b0391
assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_474029a89693
training: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: int, DEVICE_TYPE: string ... 10 more fields]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: int, DEVICE_TYPE: string ... 10 more fields]
import org.apache.spark.ml.Pipeline
lr: org.apache.spark.ml.classification.LogisticRegression = logreg_cf2a6574a539
pipeline: org.apache.spark.ml.Pipeline = pipeline_2be53eb735dc
model: org.apache.spark.ml.PipelineModel = pipeline_2be53eb735dc
results: org.apache.spark.sql.DataFrame = [label: int, DEVICE_TYPE: string ... 32 more fields]
import org.apache.spark.mllib.evaluation.MulticlassMetrics
predictionAndLabels: org.apache.spark.rdd.RDD[(Double, Double)] = MapPartitionsRDD[1413] at rdd at <console>:292
metrics: org.apache.spark.mllib.evaluation.MulticlassMetrics = org.apache.spark.mllib.evaluation.MulticlassMetrics@f92e2af
Error:
Confusion matrix:
16/12/28 15:15:43 ERROR Executor: Exception in task 2.0 in stage 771.0 (TID 1651)
org.apache.spark.SparkException: Failed to execute user defined function($anonfun$4: (string) => double)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:192)
at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.SparkException: Unseen label: 4500.
at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$4.apply(StringIndexer.scala:170)
at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$4.apply(StringIndexer.scala:166)
... 15 more
16/12/28 15:15:43 ERROR TaskSetManager: Task 2 in stage 771.0 failed 1 times; aborting job
16/12/28 15:15:43 ERROR Executor: Exception in task 0.0 in stage 771.0 (TID 1649)
org.apache.spark.SparkException: Failed to execute user defined function($anonfun$4: (string) => double)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:192)
at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.SparkException: Unseen label: 3583.
at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$4.apply(StringIndexer.scala:170)
at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$4.apply(StringIndexer.scala:166)
... 15 more
16/12/28 15:15:43 ERROR Executor: Exception in task 1.0 in stage 771.0 (TID 1650)
org.apache.spark.SparkException: Failed to execute user defined function($anonfun$4: (string) => double)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:192)
at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.SparkException: Unseen label: 8710.
at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$4.apply(StringIndexer.scala:170)
at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$4.apply(StringIndexer.scala:166)
... 15 more
org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 771.0 failed 1 times, most recent failure: Lost task 2.0 in stage 771.0 (TID 1651, localhost): org.apache.spark.SparkException: Failed to execute user defined function($anonfun$4: (string) => double)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:192)
at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.SparkException: Unseen label: 4500.
at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$4.apply(StringIndexer.scala:170)
at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$4.apply(StringIndexer.scala:166)
... 15 more
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1454)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1442)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1441)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1441)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:811)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1667)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1622)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1611)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:632)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1873)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1886)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1899)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1913)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:912)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:358)
at org.apache.spark.rdd.RDD.collect(RDD.scala:911)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$collectAsMap$1.apply(PairRDDFunctions.scala:745)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$collectAsMap$1.apply(PairRDDFunctions.scala:744)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:358)
at org.apache.spark.rdd.PairRDDFunctions.collectAsMap(PairRDDFunctions.scala:744)
at org.apache.spark.mllib.evaluation.MulticlassMetrics.tpByClass$lzycompute(MulticlassMetrics.scala:48)
at org.apache.spark.mllib.evaluation.MulticlassMetrics.tpByClass(MulticlassMetrics.scala:44)
at org.apache.spark.mllib.evaluation.MulticlassMetrics.labels$lzycompute(MulticlassMetrics.scala:223)
at org.apache.spark.mllib.evaluation.MulticlassMetrics.labels(MulticlassMetrics.scala:223)
at org.apache.spark.mllib.evaluation.MulticlassMetrics.confusionMatrix(MulticlassMetrics.scala:68)
... 159 elided
Caused by: org.apache.spark.SparkException: Failed to execute user defined function($anonfun$4: (string) => double)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:192)
at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.SparkException: Unseen label: 4500.
at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$4.apply(StringIndexer.scala:170)
at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$4.apply(StringIndexer.scala:166)
... 15 more
EDIT:
When I don't split the data into training and test sets, I don't get any error. How can I avoid the "Unseen label" error when fitting the pipeline?
Answer 0 (score: 0)
I avoided the error by telling the StringIndexer how to handle invalid values. The "Unseen label" exceptions occur because the indexers are fit on the training split only, so category values that appear only in the test split have no index; setting handleInvalid to "skip" makes the fitted StringIndexerModel drop those rows instead of throwing.
val DeviceTypeIndexer = new StringIndexer()
  .setInputCol("DEVICE_TYPE")
  .setOutputCol("DeviceTypeIndex")
  .setHandleInvalid("skip")
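The same option has to be set on every indexer in the pipeline, not just DeviceTypeIndexer, since any of the indexed columns can contain values in the test split that were never seen during training. A minimal sketch, using the column names from the schema above and a hypothetical helper (the output-column names here are illustrative, not necessarily the exact ones used in the original pipeline):

import org.apache.spark.ml.feature.StringIndexer

// Hypothetical helper: build one indexer per categorical column, skipping rows
// whose value was never seen during fit() instead of throwing "Unseen label".
def indexerFor(column: String): StringIndexer =
  new StringIndexer()
    .setInputCol(column)
    .setOutputCol(column + "Index")   // illustrative output name
    .setHandleInvalid("skip")

val indexers = Seq(
  "DEVICE_TYPE", "MARKETPLACE", "VEL", "GL_PRODUCT_GROUP", "IS_FT",
  "INSTOCK_STATUS", "PDD_WD", "IN_STOCK_QUANTITY", "IS_DISCOUNT"
).map(indexerFor)

Note that "skip" silently drops the affected test rows, so the confusion matrix is computed on fewer examples; on Spark 2.2+, setHandleInvalid("keep") is an alternative that maps unseen values to an extra index instead of dropping the row.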