我目前正在 PySpark 中实现梯度提升(Gradient Boosting)分类模型,数据来自 Kaggle 数据集。拟合管道之后,当前的最终列见下方完整代码中的 finalcolumns。
我现在正在尝试使用 ParamGridBuilder 进行参数调优。这是我构建参数网格的代码:
param_grid=ParamGridBuilder.addGrid(gradboost.maxDepth,[2,3,4]).addGrid(gradboost.minInfoGain,[0.0, 0.1, 0.2, 0.3]).addGrid(gradboost.stepSize,[0.05, 0.1, 0.2, 0.4]).build()
我的错误
****param_grid=ParamGridBuilder.addGrid(gradboost.maxDepth,[2,3,4]).addGrid(gradboost.minInfoGain,[0.0, 0.1, 0.2, 0.3]).addGrid(gradboost.stepSize,[0.05, 0.1, 0.2, 0.4]).build()
TypeError: addGrid() missing 1 required positional argument: 'values'****
我之前没有使用过 ParamGridBuilder。这些数组中的值是否对应我当前数据帧的每一列?请帮我找出错误的原因,并说明这些值的基本用法。这是我的完整代码:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer,VectorIndexer,OneHotEncoder,VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder,CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Start (or reuse) the Hive-enabled Spark session used by the whole script.
spark = (
    SparkSession.builder
    .config("spark.sql.warehouse.dir", "file:///C:/temp")
    .appName("Gradientboostapp")
    .enableHiveSupport()
    .getOrCreate()
)

# Load the liver-patient CSV: the first row is the header and column types
# are inferred from the data.
data = spark.read.csv(
    "C:/Users/codemen/Desktop/Timeseries Analytics/liver_patient.csv",
    header=True,
    inferSchema=True,
)
print(data.count())

# Drop rows containing nulls and report the row count again so the effect
# of the clean-up is visible.
print("After deleting null values")
data = data.na.drop()
print(data.count())
data.show(5)
# The second CSV column is the only categorical feature handled here.
gender_column = data.columns[1:2]

# Index the categorical feature into 'genderindexed' and the 'category'
# column into 'classlabel' (the label column used by the classifier below).
stringindexstage = [
    StringIndexer(inputCol=c, outputCol='genderindexed') for c in gender_column
]
stringindexstage = stringindexstage + [
    StringIndexer(inputCol='category', outputCol='classlabel')
]
for x in stringindexstage:
    # Each indexer must be fitted before it can transform the frame.
    data = x.fit(data).transform(data)
# NOTE(review): the pasted source lost its indentation, so it is ambiguous
# whether this preview ran inside the loop; it is placed after the loop here.
data.show(3)

# One-hot encode the indexed gender column into 'onehot<column name>'.
onehotencoderstage = [
    OneHotEncoder(inputCol='genderindexed', outputCol='onehot' + c)
    for c in gender_column
]
for onehot in onehotencoderstage:
    # NOTE(review): calling transform() without fit() matches the Spark 2.x
    # OneHotEncoder; on Spark 3.x the encoder is an estimator and would need
    # onehot.fit(data).transform(data) -- confirm the target Spark version.
    data = onehot.transform(data)
data.show()
# Preview the frame right before the feature vector is assembled.
print("data current")
data.show(3)

# Columns packed into the single 'features' vector.
# NOTE(review): 'onehotGender' assumes the encoded column is named 'Gender'.
feature_column = [
    'Age',
    'onehotGender',
    'Total_Bilirubin',
    'Direct_Bilirubin',
    'Alkaline_Phosphotase',
    'Alamine_Aminotransferase',
    'Aspartate_Aminotransferase',
    'Total_Protiens',
    'Albumin',
    'Albumin_and_Globulin_Ratio',
]
print(feature_column)

# Single-stage pipeline: only the VectorAssembler runs here, because the
# indexing and encoding were already applied to `data` directly.
vectorassmblestage = [
    VectorAssembler(inputCols=feature_column, outputCol="features"),
]
pipelinestage = Pipeline(stages=vectorassmblestage)
pipelinemodel = pipelinestage.fit(data)

# Keep the raw inputs, the assembled vector and the label column.
finalcolumns = [*feature_column, 'features', 'classlabel']
assembled = pipelinemodel.transform(data)
dataframe = assembled.select(finalcolumns)
print("final column print")
dataframe.show(5)

# 70/30 train/test split; the fixed seed keeps the split reproducible.
traindata, testdata = dataframe.randomSplit([0.7, 0.3], seed=1234)
# Gradient-boosted tree classifier over the assembled feature vector.
gradboost = GBTClassifier(
    featuresCol='features',
    labelCol='classlabel',
    maxIter=10,
)

# Hyper-parameter grid. ParamGridBuilder must be INSTANTIATED before
# addGrid() is chained: calling addGrid on the class itself binds the param
# to `self` and raises "missing 1 required positional argument: 'values'".
# Each list holds candidate values for one hyper-parameter of the model
# (not dataframe columns); build() yields every combination (3 * 4 * 4 = 48).
param_grid = (
    ParamGridBuilder()
    .addGrid(gradboost.maxDepth, [2, 3, 4])
    .addGrid(gradboost.minInfoGain, [0.0, 0.1, 0.2, 0.3])
    .addGrid(gradboost.stepSize, [0.05, 0.1, 0.2, 0.4])
    .build()
)

# Evaluation
print("Evaluation stage")
# labelCol must name the real label column; the evaluator's default is
# 'label', which does not exist in this dataframe.
# NOTE(review): scoring on the hard 'prediction' column is kept from the
# original; if the installed Spark version's GBT model emits 'rawPrediction',
# using that column gives a more informative ROC metric -- confirm.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='prediction',
    labelCol='classlabel',
)

# Cross-validation
print("cross validation stage")
crossvalidation = CrossValidator(
    estimator=gradboost,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
)
# Fit on the training split only, so the held-out test split does not leak
# into model selection.
crossvalidateData = crossvalidation.fit(traindata)

# Prediction on the training data with the best model found by the search.
print("Prediction in Training data ....")
predictTrain = crossvalidateData.transform(traindata)
predictTrain.show(10)
提前谢谢
答案 0 :(得分:0)
首先,您需要带括号调用 ParamGridBuilder(),即先创建实例再调用 addGrid。addGrid 是实例方法,直接在类上调用时,第一个参数会被当作 self,因此报错缺少 'values' 参数。另外,这些数组是各个超参数的候选取值,而不是数据帧的列。
所以;
param_grid = ParamGridBuilder() \
.addGrid(...)