Hi everyone, I have the following problem. When I call splitAndTest I pass an RDD of 1493 records into the function, but the total number of records in test_data_pair comes out the same as the total in rdd, i.e. subtract removes nothing. I don't know why this happens, because when I test the subtract part outside of the function it works perfectly :/
import random
import time

from pyspark.mllib.classification import NaiveBayes

def splitAndTest(rdd, num):
    # Sample `num` points without replacement as the training set
    train_list = rdd.takeSample(False, num, random.randint(0, 100))
    train_data_pair = sc.parallelize(train_list)
    # Everything not in the training sample should become the test set
    test_data_pair = rdd.subtract(train_data_pair)
    print(train_data_pair.count())
    print(test_data_pair.count())
    print(rdd.count())
    # Time the model training
    start = time.time()
    model = NaiveBayes.train(train_data_pair, 1.0)
    end = time.time()
    train_time = end - start
    # Evaluate on the held-out test set
    prediction_and_labels = test_data_pair.map(
        lambda point: (model.predict(point.features), point.label))
    correct = prediction_and_labels.filter(lambda pl: pl[0] == pl[1])
    accuracy = correct.count() / float(test_data_pair.count())
    return (train_time, accuracy)
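
For reference, this is roughly the standalone test where subtract behaves as expected. The toy data here are plain ints rather than the LabeledPoint records I actually feed into splitAndTest (which may or may not matter), so treat it as a sketch of the check rather than my exact code:

from pyspark import SparkContext

sc = SparkContext(appName="subtractCheck")

# Toy stand-in data: 1493 distinct ints instead of my real LabeledPoints
rdd = sc.parallelize(range(1493))
sample = sc.parallelize(rdd.takeSample(False, 1000, 42))

# Outside splitAndTest this prints 493 as expected,
# i.e. subtract really does remove the sampled elements
print(rdd.subtract(sample).count())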