这是我的代码:
import pandas as pd
df = pd.read_csv('train.csv')
gender_dict = {"male": 1, "female": 2}
eye_color_dict = {"amber": 1, "blue": 2, "brown": 3, "gray": 4, "green": 5, "hazel": 6}
race_dict = {"black": 1, "white": 2, "middle_eastern": 3,"asian":4}
accommodation_type_dict = {"apartment": 1, "homeless": 2, "shared_residence": 3, "villa": 4, "other": 5}
education_status_dict = {"associate_degree": 1, "bachelors_degree": 2, "graduate_or_professional_degree": 3, "high_school": 4, "less_than_9th_grade": 5, "not_applicable": 6}
blood_type_dict = {"A+": 1, "A-": 2, "B+": 3, "B-": 4, "O+": 5, "O-": 6, "AB+": 7, "AB-": 8}
occupation_dict = {"agriculture": 1, "art": 2, "business": 3, "education": 4, "engineering": 5, "healthcare": 6, "unemployed": 7, "other": 8}
living_area_dict = {"suburbs": 1, "rural": 2, "urban": 3, "other": 4}
sports_engagement_dict = {"never": 1, "sometimes": 2, "seldom": 3, "regularly": 4}
favorite_music_genre_dict = {"r&b": 1, "rock": 2, "pop": 3, "country": 4, "other": 5, "edm": 6, "classical": 7}
favorite_color_dict = {"green": 1, "orange": 2, "yellow": 3, "purple": 4, "blue": 5, "pink": 6, "red": 7}
owned_car_brand_dict = {"audi": 1, "bmw": 2, "ford": 3, "honda": 4, "hyundai": 5, "kia": 6, "none": 7, "tesla": 8, "other": 9, "mitsubishi": 10}
hours_worked_each_week_dict = {"not_applicable": 1}
owns_a_pet_dict = {"yes": 1, "no": 2}
has_health_insurance_dict = {"yes": 1, "no": 2}
has_cancer_dict = {"yes": 1, "no": 2}
smokes_dict = {"yes": 1, "no": 2}
has_alzheimers_dict = {"yes": 1, "no": 2}
facial_hair_dict = {"long": 1, "short": 2, "none": 3}
diet_type_dict = {"regular": 1, "vegetarian": 2, "keto": 3, "vegan": 4, "low-carb": 5, "paleo": 6}
df['gender'] = df['gender'].map(gender_dict)
df['eye_color'] = df['eye_color'].map(eye_color_dict)
df['race'] = df['race'].map(race_dict)
df['accommodation_type'] = df['accommodation_type'].map(accommodation_type_dict)
df['education_status'] = df['education_status'].map(education_status_dict)
df['blood_type'] = df['blood_type'].map(blood_type_dict)
df['occupation'] = df['occupation'].map(occupation_dict)
df['living_area'] = df['living_area'].map(living_area_dict)
df['sports_engagement'] = df['sports_engagement'].map(sports_engagement_dict)
df['favorite_music_genre'] = df['favorite_music_genre'].map(favorite_music_genre_dict)
df['favorite_color'] = df['favorite_color'].map(favorite_color_dict)
df['owned_car_brand'] = df['owned_car_brand'].map(owned_car_brand_dict)
df['hours_worked_each_week'] = df['hours_worked_each_week'].map(hours_worked_each_week_dict)
df['owns_a_pet'] = df['owns_a_pet'].map(owns_a_pet_dict)
df['has_health_insurance'] = df['has_health_insurance'].map(has_health_insurance_dict)
df['has_cancer'] = df['has_cancer'].map(has_cancer_dict)
df['smokes'] = df['smokes'].map(smokes_dict)
df['has_alzheimers'] = df['has_alzheimers'].map(has_alzheimers_dict)
df['facial_hair'] = df['facial_hair'].map(facial_hair_dict)
df['diet_type'] = df['diet_type'].map(diet_type_dict)
import sklearn
from sklearn import svm, preprocessing
df = sklearn.utils.shuffle(df)
X = df.drop("infected", axis=1).values
X = preprocessing.scale(X)
y = df['infected'].values
test_size = 200
X_train = X[:-test_size]
y_train = y[:-test_size]
X_test = X[-test_size:]
y_test = y[-test_size:]
clf = svm.SVR(kernel="linear")
clf.fit(X_train,y_train)
clf.score(X_test,y_test)
for X,y in zip(X_test, y_test):
print(f"Model: {clf.predict([X])[0]}, Actual: {y}")
我遇到价值错误:
ValueError:输入包含NaN,无穷大或对于dtype('float64')而言太大的值。
它告诉我:
<ipython-input-1-8b8c4c2d113b> in <module>
62
63 clf = svm.SVR(kernel="linear")
---> 64 clf.fit(X_train,y_train)
65
66 clf.score(X_test,y_test)`
我正在使用jupyter-notebook,我是sklearn和ml的新手 我附上了CSV文件,谢谢您的帮助
答案 0 :(得分:0)
看起来hours_worked_each_week
列包含空值。
如果删除该列,是否会得到相同的错误:
X = df.drop(['infected', 'hours_worked_each_week'], axis=1).values
或者,您可以将null替换为0
df.fillna(0,inplace=True)