从Pandas DataFrame进行预测

时间:2018-09-15 09:43:55

标签: python pandas dataframe classification predict

总的来说,我是数据科学(Data Science)/ Pandas 的新手。我主要参照了 this 这篇教程,并且已经能够用不同的分类器让它正常运行。

import pandas as pd
import src.helper as helper
import time
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier

# Column names in file order; the first entry is the label we predict later
headings = ['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment', 'gill-spacing',
            'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
            'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type',
            'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']

# Load the data; names are supplied explicitly via `headings`.
# (The former `converters={"header": float}` argument was dropped: no column
# is called "header", so it never applied to anything.)
shrooms = pd.read_csv('data/shrooms_no_header.csv', names=headings)

# Replace the ? in 'stalk-root' with 0 (NaN is only an intermediate marker;
# note that fillna also zero-fills every other missing cell in the frame)
shrooms.loc[shrooms['stalk-root'] == '?', 'stalk-root'] = np.nan
shrooms.fillna(0, inplace=True)

# Remove columns with only one unique value.
# Iterate over a snapshot of the column names because columns are dropped
# inside the loop, and compute the distinct values only once per column.
for col in list(shrooms.columns):
    distinct_values = shrooms[col].unique()
    if len(distinct_values) <= 1:
        print("Removing column {}, which only contains the value: {}".format(col, distinct_values[0]))
        shrooms.drop(col, axis=1, inplace=True)

# Col to predict later
col_predict = 'class'

# Feature columns = everything except the label
all_cols = list(shrooms.columns.values)
all_cols.remove(col_predict)

# Encode the label column in place
helper.encode(shrooms, [col_predict])

# Expand every feature column into "<col>_is_<value>" indicator columns
helper.expand(shrooms, all_cols)

# Column list after expansion, again without the class we want to predict
x_all = list(shrooms.columns.values)
x_all.remove(col_predict)

# Set Train/Test ratio
ratio = 0.7

# Split the DF
df_train, df_test, X_train, Y_train, X_test, Y_test = helper.split_df(shrooms, col_predict, x_all, ratio)

# Try different classifier
# TODO: Batch Use to compare
classifier = GradientBoostingClassifier(n_estimators=1000)

# TODO: Optimize Hyperparamter (where applicable)

# Time the training (process_time counts CPU time of this process, not wall-clock time)
timer_start = time.process_time()
classifier.fit(X_train, Y_train)
timer_stop = time.process_time()
time_diff = timer_stop - timer_start

# Get the score
score_train = classifier.score(X_train, Y_train)
score_test = classifier.score(X_test, Y_test)

print('Train Score {}, Test Score {}, Time {}'.format(score_train, score_test, time_diff))

# TODO: Test a manual DataFrame

“helper”模块里是一些我不太理解的函数,但它们可以正常工作:

import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt


def split_df(df, y_col, x_cols, ratio, random_state=None):
    """
    Randomly split a dataframe into a train and a test set.

    Every row is assigned to the train set independently with probability
    ``ratio``, so the resulting sizes are only approximately
    ``ratio : 1 - ratio`` (this is not an exact-size split).

    Parameters:
        df: the dataframe to split.
        y_col: name of the column with the Y values.
        x_cols: list of column names used as features.
        ratio: probability that a row ends up in the train set (usually 0.7).
        random_state: optional integer seed. When given, the split is
            reproducible and does not touch numpy's global random state;
            when None (default) the global numpy generator is used, exactly
            as before.

    Returns:
        A tuple ``(train, test, x_train, y_train, x_test, y_test)`` where
        ``train``/``test`` are dataframes and the other four are numpy arrays.
    """
    if random_state is None:
        draws = np.random.rand(len(df))
    else:
        # A private generator keeps the split repeatable without reseeding
        # the global generator other code may rely on.
        draws = np.random.RandomState(random_state).rand(len(df))

    mask = draws < ratio
    train = df[mask]
    test = df[~mask]

    y_train = train[y_col].values
    y_test = test[y_col].values
    x_train = train[x_cols].values
    x_test = test[x_cols].values

    return train, test, x_train, y_train, x_test, y_test


def encode(df, columns, encoders=None):
    """
    Label-encode the given columns of ``df`` in place.

    Each listed column is replaced by the integer codes produced by a
    ``LabelEncoder``. The encoders are returned so the very same
    value -> code mapping can later be applied to other data (for example a
    hand-written sample) by passing them back in through ``encoders``.

    Parameters:
        df: dataframe whose columns are overwritten with their codes.
        columns: iterable of column names to encode.
        encoders: optional dict ``{column name: fitted LabelEncoder}``.
            Columns found in this dict are transformed with the supplied
            encoder instead of fitting a new one; ``transform`` then raises
            ``ValueError`` for values the encoder has never seen.

    Returns:
        dict mapping every processed column name to the encoder used for it.
        (Earlier versions returned None; callers ignoring the result are
        unaffected.)
    """
    used_encoders = {}
    for col in columns:
        if encoders is not None and col in encoders:
            le = encoders[col]
        else:
            # Fitting on the full column learns the same classes as fitting
            # on its unique values.
            le = LabelEncoder().fit(df[col])
        df[col] = le.transform(df[col])
        used_encoders[col] = le
    return used_encoders


def expand(df, list_columns):
    """
    Replace categorical columns by 0/1 indicator columns, in place.

    For every column in ``list_columns`` and every distinct value it holds,
    a new column named ``"<column>_is_<value>"`` is appended that is 1.0
    where the row has that value and 0.0 elsewhere. The original columns are
    dropped afterwards.

    Parameters:
        df: dataframe to modify in place.
        list_columns: list of column names to expand.

    Returns:
        None; ``df`` is mutated.
    """
    for col in list_columns:
        for colvalue in df[col].unique():
            newcol_name = "{}_is_{}".format(col, colvalue)
            # One vectorised comparison fills the whole column at once.
            # Casting to float fixes the dtype; the former two-step masked
            # assignment went through a NaN-filled intermediate column, so
            # the resulting dtype depended on the data.
            df[newcol_name] = (df[col] == colvalue).astype(float)
    df.drop(list_columns, inplace=True, axis=1)


def correlation_to(df, col):
    """
    Show a bar chart of the absolute correlation of every other column with ``col``.

    The correlation matrix of ``df`` is computed with ``DataFrame.corr()``,
    the column for ``col`` is taken in absolute value, and the remaining
    features are plotted from strongest to weakest correlation.

    Parameters:
        df: dataframe whose columns are correlated with each other.
        col: name of the column the other columns are compared against.

    Returns:
        None; the figure is displayed with ``plt.show()``.
    """
    abs_correlation = df.corr()[col].abs()
    # Remove the target's self-correlation by label. Slicing off the first
    # sorted entry instead would discard the wrong feature whenever another
    # column ties at 1.0 or the self-correlation is NaN (NaN sorts last).
    desc_corr_values = abs_correlation.drop(col).sort_values(ascending=False)

    y_values = list(desc_corr_values.values)
    x_values = range(len(y_values))
    xlabels = list(desc_corr_values.index)

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.bar(x_values, y_values)
    ax.set_title('The correlation of all features with {}'.format(col), fontsize=20)
    ax.set_ylabel('Pearson correlatie coefficient [abs waarde]', fontsize=16)
    plt.xticks(x_values, xlabels, rotation='vertical')
    plt.show()

我想进行一次“手动”测试,例如输入x个属性并以此为基础进行预测。

例如,我对一个DataFrame进行如下硬编码:

# A single hand-written sample, given as one record (feature name -> value).
# It has no 'class' entry. Wrapping the record in a list makes pandas build
# a one-row frame.
manual = pd.DataFrame([
    {
        "cap-shape": "x",
        "cap-surface": "s",
        "cap-color": "n",
        "bruises": "f",
        "odor": "n",
        "gill-attachment": "a",
        "gill-spacing": "c",
        "gill-size": "b",
        "gill-color": "y",
        "stalk-shape": "e",
        "stalk-root": "?",
        "stalk-surface-above-ring": "s",
        "stalk-surface-below-ring": "s",
        "stalk-color-above-ring": "o",
        "stalk-color-below-ring": "o",
        "veil-type": "p",
        "veil-color": "o",
        "ring-number": "o",
        "ring-type": "p",
        "spore-print-color": "o",
        "population": "c",
        "habitat": "l",
    },
])

我该如何对它应用相同的编码?我的代码里写的是 helper.encode(manual, [col_predict]),但 manual 这个 DataFrame 当然没有 col_predict 这一列。

请记住,我完全是个初学者。我在网上搜索了很多,但找不到合适的源代码/教程来教我如何测试单条样本。

完整代码可在here中找到。

1 个答案:

答案 0 :(得分:1)

尝试一下:

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

data = pd.read_csv('agaricus-lepiota.data.txt', header=None)  # read data; columns get integer labels
data.rename(columns={0: 'y'}, inplace=True)  # rename predict column (edible or not)

le = LabelEncoder()  # module-level name retained for code that refers to `le`

# Fit one encoder per column and keep it. Reusing a single encoder with
# fit_transform for every column leaves it fitted on the last column only,
# so the mapping of all other columns would be lost.
encoders = {col: LabelEncoder().fit(data[col]) for col in data.columns}
data = data.apply(lambda column: encoders[column.name].transform(column))

# X without predict column. `columns=` is used because pandas 2.0 no longer
# accepts the axis as a positional argument.
X = data.drop(columns='y')
y = data['y']  # predict column

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

clf = GradientBoostingClassifier()  # you can pass arguments

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)  # predictions for the objects in the test set

print(accuracy_score(y_test, y_pred))  # check accuracy

我想您可以在sklearn网站上了解更多有关此内容的信息。 这个例子是您想要的吗?

要用您的 manual 数据进行预测:

# Encode `manual` with encoders fitted on the training file, not on `manual`
# itself: fit_transform on a one-row frame maps every value to 0, whatever
# the value is.
# NOTE(review): assumes `manual` lists its features in the same order as the
# feature columns (1..n) of the data file - confirm before relying on it.
raw_features = pd.read_csv('agaricus-lepiota.data.txt', header=None).drop(columns=0)
manual = pd.DataFrame({
    # Keyed by the file's own column labels so the frame matches what `clf` was fitted on.
    position: LabelEncoder().fit(raw_features[position]).transform(manual[name])
    for position, name in zip(raw_features.columns, manual.columns)
})
clf.predict(manual)