train=pd.read_csv("C:\\Users\\User\\Documents\\data sets\\train.csv")
test=pd.read_csv("C:\\Users\\User\\Documents\\data sets\\test.csv")
import numpy as np
frames=[train,test]
data=pd.concat(frames)
print(data.shape)
data.head()
data.isnull().any()
data.fillna(999,inplace=True)
data.head(20)
data.Age[data["Age"]=="0-17"]="15"
data["Age"].head(10)
data.Age[data["Age"]=="18-25"]="21"
data.Age[data["Age"]=="26-35"]="30"
data.Age[data["Age"]=="36-45"]="40"
data.Age[data["Age"]=="46-50"]="48"
data.Age[data["Age"]=="51-55"]="53"
data.Age[data["Age"]=="55+"]="60"
data.Stay_In_Current_City_Years[data["Stay_In_Current_City_Years"]=="4+"]
="4"
data["Age"]=data["Age"].astype(int)
data["Stay_In_Current_City_Years"]=data["Stay_In_Current_City_Years"].
astype(int)
data.dtypes
data["Marital_Status"]=data["Marital_Status"].astype(int)
data["Occupation"]=data["Occupation"].astype(int)
data["Product_Category_1"]=data["Product_Category_1"].astype(int)
data["Product_Category_1"]=data["Product_Category_1"].astype(int)
data["Product_Category_2"]=data["Product_Category_2"].astype(float)
data["Product_Category_3"]=data["Product_Category_3"].astype(float)
data["Purchase"]=data["Purchase"].astype(float)
sex=pd.get_dummies(data["Gender"]).iloc[:,1:]
data1=pd.concat([data,sex],axis=1)
city=pd.get_dummies(data["City_Category"]).iloc[:,1:]
data1=pd.concat([data,sex,city],axis=1)
# cross validation and creating the features and the target variable
from sklearn.cross_validation import train_test_split
y=data1["Purchase"]
x=data1[["Age","City_Category","Gender","Marital_Status","Occupation",
"Product_Category_1","Product_Category_2","Product_Category_3","Product_ID"
,"Stay_In_Current_City_Years","User_ID","M","B","C"]]
x_train,x_test,y_train,y_test=train_test_split(x,y)
# building the regration
from sklearn import linear_model
reg=linear_model.LinearRegression()
reg.fit(x_train,y_train)
但是我继续得到这个:
ValueError: could not convert string to float: 'P00100642'
是什么意思?我还需要转换为整数才能运行回归吗? 我该如何解决? 谢谢:)
答案 0 :(得分:1)
机器学习算法仅采用数字数据。 Purchase_ID
列没有数字数据,因为它以' P '开头。您正在尝试通过它,因为这样会导致错误。
注意值中的模式,您将看到每个条目均以“ P00
”开头。由于它是一个字符串,因此可以用 nothing 代替它。
尝试一下:
data['Product_ID'] = data['Product_ID'].str.replace('P00', '')
之后,您可以使用StandardScaler
缩小值。