I'm using Python 3.6 with Spark 2.3 for a classification task. The features were built with StringIndexer/OneHotEncoder and VectorAssembler. Even though the feature vectors differ, all of the raw predictions come out with the same probability. Could this be a bug?
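For context, the feature pipeline is roughly the following minimal sketch; the column and DataFrame names (`category`, `amount`, `age`, `train_df`) are placeholders, since the original ones aren't shown here:

```python
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

# Index a categorical column, one-hot encode it, then assemble everything
# into the single "features" vector column shown in the schema below.
indexer = StringIndexer(inputCol="category", outputCol="category_idx")
encoder = OneHotEncoder(inputCol="category_idx", outputCol="category_vec")
assembler = VectorAssembler(
    inputCols=["category_vec", "amount", "age"],
    outputCol="features",
)

# The classifier itself is not named in the question, so it is omitted
# here; it would be appended as the last stage of the pipeline.
pipeline = Pipeline(stages=[indexer, encoder, assembler])
features_df = pipeline.fit(train_df).transform(train_df)
```

The predictions DataFrame has the following schema: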
root
|-- features: vector (nullable = true)
|-- rawPrediction: vector (nullable = true)
|-- label: double (nullable = false)
|-- prediction: double (nullable = false)
Correctly predicted rows (label 0.0, prediction 0.0):
"(12083,[0,2,26,359,750,2455,2456,2457,2464,2570,2573,2600,3151,4604,4837,6755,8700,10356,12065,12067,12071,12078],[1.0,1.0,1.0,1.0,1.0,1.0,90.8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,45.0,1.0,1.0])","[2.020421840532351,-2.020421840532351]",0.0,0.0
"(12083,[0,2,26,359,750,2455,2456,2457,2465,2571,2574,2622,3496,4604,5190,6755,8335,9045,10356,12063,12065,12067,12071,12076],[1.0,1.0,1.0,1.0,1.0,1.0,559.83,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,46.0,1.0,1.0])","[2.020421840532351,-2.020421840532351]",0.0,0.0
Incorrectly predicted rows (label 1.0, prediction 0.0):
"(12083,[3,24,343,598,2455,2456,2463,2464,2571,2573,2576,2787,4604,4659,6508,8335,8336,10206,12063,12064,12066,12067,12073,12075],[1.0,1.0,1.0,1.0,1.0,646.21,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,87.0,1.0,1.0])","[0.5697317865812035,-0.5697317865812035]",1.0,0.0
"(12083,[3,36,378,699,2455,2456,2457,2469,2571,2573,2576,2814,4604,4761,6535,8335,8363,10302,12063,12064,12066,12067,12068,12075],[1.0,1.0,1.0,1.0,1.0,923.9,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0])","[0.5697317865812035,-0.5697317865812035]",1.0,0.0
All of the predictions are 0.0; the classifier never predicts the positive class. Whenever the label is 1.0, rawPrediction is [0.5697317865812035, -0.5697317865812035], and whenever the label is 0.0, rawPrediction is [2.020421840532351, -2.020421840532351]. So the model only ever produces two distinct score vectors, and in both of them the first component is the larger one, which is why prediction always comes out as 0.0.
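To confirm how few distinct scores the model actually produces, something like the following can be run (`predictions` is an assumed name for the transformed test DataFrame):

```python
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType

# Pull the first component of the rawPrediction vector out as a plain
# double, which is easier to group and inspect than the vector itself.
first_raw = F.udf(lambda v: float(v[0]), DoubleType())

(predictions
 .withColumn("raw0", first_raw("rawPrediction"))
 .groupBy("label", "raw0")
 .count()
 .orderBy("label")
 .show(truncate=False))
```

If this shows only one `raw0` value per label, the model is collapsing all inputs to the same score rather than responding to the individual feature vectors.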