Question

我尝试在 Iris 数据集上绘制 SVM 分类器（起始代码可以在这里找到）。我给 pandas DataFrame 扩展了四个额外的列，并希望以相同的方式对它们进行绘图。

我使用代码创建了四个额外的列：

# Derive four extra feature columns from the raw sepal/petal measurements:
# two length/width ratios and two length*width products.
iris = (
    iris
    .assign(SepalRatio=iris['SepalLengthCm'] / iris['SepalWidthCm'])
    .assign(PetalRatio=iris['PetalLengthCm'] / iris['PetalWidthCm'])
    .assign(SepalMultiplied=iris['SepalLengthCm'] * iris['SepalWidthCm'])
    .assign(PetalMultiplied=iris['PetalLengthCm'] * iris['PetalWidthCm'])
)

我还创建了一个额外的 SpecieID 列：

# Map each species name to an integer class id; names missing from the
# mapping are given the sentinel id -1.
d = {"Iris-setosa": 0, "Iris-versicolor": 1, "Iris-virginica": 2}
# map() yields NaN for an unmapped name, which turns the whole column into
# floats; cast back to int after filling so the labels are always integers.
iris['SpecieID'] = iris['Species'].map(d).fillna(-1).astype(int)

然后，我从DataFrame中提取了一些列以进行绘图，但在绘图之后我得到了错误：

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-49-9724675f32fa> in <module>()
     77 xx, yy = make_meshgrid(X0, X1)
     78 
---> 79 for clf, title, ax in zip(models, titles, sub.flatten()):
     80     plot_contours(ax, clf, xx, yy,
     81                   cmap=plt.cm.coolwarm, alpha=0.8)

<ipython-input-49-9724675f32fa> in <genexpr>(.0)
     62           svm.SVC(kernel='rbf', gamma=0.7, C=C),
     63           svm.SVC(kernel='poly', degree=3, C=C))
---> 64 models = (clf.fit(X, y) for clf in models)
     65 
     66 # title for the plots

C:\Users\masc\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\svm\base.py in fit(self, X, y, sample_weight)
    150 
    151         X, y = check_X_y(X, y, dtype=np.float64, order='C', accept_sparse='csr')
--> 152         y = self._validate_targets(y)
    153 
    154         sample_weight = np.asarray([]

C:\Users\masc\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\svm\base.py in _validate_targets(self, y)
    518     def _validate_targets(self, y):
    519         y_ = column_or_1d(y, warn=True)
--> 520         check_classification_targets(y)
    521         cls, y = np.unique(y_, return_inverse=True)
    522         self.class_weight_ = compute_class_weight(self.class_weight, cls, y_)

C:\Users\masc\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\utils\multiclass.py in check_classification_targets(y)
    170     if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
    171             'multilabel-indicator', 'multilabel-sequences']:
--> 172         raise ValueError("Unknown label type: %r" % y_type)
    173 
    174 

ValueError: Unknown label type: 'unknown'

我的修改后的代码是：

from sklearn import svm

# NOTE(review): `pd` is not imported in this snippet - presumably
# `import pandas as pd` was executed earlier; confirm.
iris = pd.read_csv("Iris.csv") # the iris dataset is now a Pandas DataFrame

def make_meshgrid(x, y, h=.02):
    """Build a rectangular grid of points covering the data with a margin.

    The grid spans one unit below the minimum to one unit above the maximum
    of each input, sampled every ``h``.

    Parameters
    ----------
    x: data to base x-axis meshgrid on
    y: data to base y-axis meshgrid on
    h: stepsize for meshgrid, optional

    Returns
    -------
    xx, yy : ndarray
    """
    x_ticks = np.arange(x.min() - 1, x.max() + 1, h)
    y_ticks = np.arange(y.min() - 1, y.max() + 1, h)
    grid_x, grid_y = np.meshgrid(x_ticks, y_ticks)
    return grid_x, grid_y


def plot_contours(ax, clf, xx, yy, **params):
    """Draw a classifier's predictions over a mesh as filled contours.

    Parameters
    ----------
    ax: matplotlib axes object
    clf: a classifier
    xx: meshgrid ndarray
    yy: meshgrid ndarray
    params: dictionary of params to pass to contourf, optional
    """
    # Flatten the grid into one (x, y) row per grid point for predict().
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    predicted = clf.predict(grid_points)
    # Fold the flat predictions back into the grid's shape before plotting.
    surface = predicted.reshape(xx.shape)
    return ax.contourf(xx, yy, surface, **params)


# import some data to play with
#iris = datasets.load_iris()

# DataFrame.values replaces as_matrix(), which was removed in pandas 1.0.
# The frame mixes numeric columns with the string 'Species' column, so the
# resulting array has dtype=object.
iris_numpy_array = iris.values

print(iris_numpy_array)

# Take the first two features. We could avoid this by using a two-dim dataset.
# Slices of an object array stay object-typed, so cast to float explicitly.
X = iris_numpy_array[:, [1, 2]].astype(float)

print(X)

# Column 10 is expected to be the SpecieID label column - verify against the
# frame's column order. The cast to int is the actual fix: scikit-learn
# cannot infer a target type from an object-dtype label array and raises
# "ValueError: Unknown label type: 'unknown'".
y = iris_numpy_array[:, 10].astype(int)

print(y)

# we create an instance of SVM and fit out data. We do not scale our
# data since we want to plot the support vectors
C = 1.0  # SVM regularization parameter
models = (svm.SVC(kernel='linear', C=C),
          svm.LinearSVC(C=C),
          svm.SVC(kernel='rbf', gamma=0.7, C=C),
          svm.SVC(kernel='poly', degree=3, C=C))
# Fit eagerly (tuple instead of a lazy generator) so that a fitting error is
# reported on this line rather than later inside the plotting loop.
models = tuple(clf.fit(X, y) for clf in models)

# title for the plots
titles = ('SVC with linear kernel',
          'LinearSVC (linear kernel)',
          'SVC with RBF kernel',
          'SVC with polynomial (degree 3) kernel')

# Set-up 2x2 grid for plotting.
fig, sub = plt.subplots(2, 2)
plt.subplots_adjust(wspace=0.4, hspace=0.4)

X0, X1 = X[:, 0], X[:, 1]
xx, yy = make_meshgrid(X0, X1)

for clf, title, ax in zip(models, titles, sub.flatten()):
    # Decision regions first, then the training points on top of them.
    plot_contours(ax, clf, xx, yy,
                  cmap=plt.cm.coolwarm, alpha=0.8)
    ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xlabel('Sepal length')
    ax.set_ylabel('Sepal width')
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_title(title)

plt.show()

示例代码中 X 和 y 的内容与我代码中的内容相同，唯一的区别是我的数据是从 pandas DataFrame 中提取的。

原始代码是：

# Print this module's docstring.
print(__doc__)

# NOTE(review): `pd` is not imported anywhere in this snippet, and `iris` is
# rebound by `datasets.load_iris()` further down, so this line looks like a
# leftover - confirm before relying on it.
iris = pd.read_csv("Iris.csv") # the iris dataset is now a Pandas DataFrame

import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets


def make_meshgrid(x, y, h=.02):
    """Return coordinate matrices for a grid that pads the data by one unit.

    Each axis runs from ``min - 1`` up to (but excluding) ``max + 1`` in
    steps of ``h``.

    Parameters
    ----------
    x: data to base x-axis meshgrid on
    y: data to base y-axis meshgrid on
    h: stepsize for meshgrid, optional

    Returns
    -------
    xx, yy : ndarray
    """
    x_lo, x_hi = x.min() - 1, x.max() + 1
    y_lo, y_hi = y.min() - 1, y.max() + 1
    x_axis = np.arange(x_lo, x_hi, h)
    y_axis = np.arange(y_lo, y_hi, h)
    mesh_x, mesh_y = np.meshgrid(x_axis, y_axis)
    return mesh_x, mesh_y


def plot_contours(ax, clf, xx, yy, **params):
    """Fill the axes with the classifier's predicted class at each grid point.

    Parameters
    ----------
    ax: matplotlib axes object
    clf: a classifier
    xx: meshgrid ndarray
    yy: meshgrid ndarray
    params: dictionary of params to pass to contourf, optional
    """
    # predict() takes one sample per row, so pair up the flattened coordinates.
    samples = np.c_[xx.ravel(), yy.ravel()]
    labels_on_grid = clf.predict(samples).reshape(xx.shape)
    contour_set = ax.contourf(xx, yy, labels_on_grid, **params)
    return contour_set


# Load the iris dataset that ships with scikit-learn.
iris = datasets.load_iris()
# Only the first two feature columns are kept, which gives a two-dimensional
# dataset to plot.
X = iris.data[:, :2]
y = iris.target

# SVM regularization parameter shared by every model below. The data is not
# scaled because we want to plot the support vectors.
C = 1.0
models = (
    svm.SVC(kernel='linear', C=C),
    svm.LinearSVC(C=C),
    svm.SVC(kernel='rbf', gamma=0.7, C=C),
    svm.SVC(kernel='poly', degree=3, C=C),
)
# Lazy generator: each estimator is fitted when the plotting loop reaches it.
models = (clf.fit(X, y) for clf in models)

# One title per model, in the same order as `models`.
titles = (
    'SVC with linear kernel',
    'LinearSVC (linear kernel)',
    'SVC with RBF kernel',
    'SVC with polynomial (degree 3) kernel',
)

# 2x2 grid of axes, one per model.
fig, sub = plt.subplots(2, 2)
plt.subplots_adjust(wspace=0.4, hspace=0.4)

X0 = X[:, 0]
X1 = X[:, 1]
xx, yy = make_meshgrid(X0, X1)

# The axis limits are the mesh bounds and are the same for every subplot.
x_lo, x_hi = xx.min(), xx.max()
y_lo, y_hi = yy.min(), yy.max()

for clf, title, ax in zip(models, titles, sub.flatten()):
    plot_contours(ax, clf, xx, yy, cmap=plt.cm.coolwarm, alpha=0.8)
    ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
    ax.set_xlim(x_lo, x_hi)
    ax.set_ylim(y_lo, y_hi)
    ax.set_xlabel('Sepal length')
    ax.set_ylabel('Sepal width')
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_title(title)

plt.show()

Answer 1

我通过使用另一个也创建SVM图的模板解决了这个问题：

from sklearn import svm
from mlxtend.plotting import plot_decision_regions

# Two sepal measurements as features, the integer species id as the label.
X = iris[['SepalLengthCm', 'SepalWidthCm']]
y = iris['SpecieID']

# Fitting and plotting both work on the underlying NumPy arrays.
features = X.values
labels = y.values

clf = svm.SVC(decision_function_shape='ovo')
clf.fit(features, labels)

# Plot Decision Region using mlxtend's plotting function
plot_decision_regions(X=features, y=labels, clf=clf, legend=2)

# Axis labels come from the feature column names; then add the figure title.
plt.xlabel(X.columns[0], size=14)
plt.ylabel(X.columns[1], size=14)
plt.title('SVM Decision Region Boundary', size=16)

此代码给出了图：

Answer 2

我也尝试这样做，最后（经过很多小时）这是我的代码：

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
iris = load_iris()

# ========================================

# Build a DataFrame with one column per feature name.
df = pd.DataFrame(iris['data'], columns=iris['feature_names'])

# Add a 'target' column holding the class id that represents the species.
df['target'] = iris['target']

# Add a 'spesies' column with the species name looked up in target_names.
df['spesies'] = df['target'].map(lambda class_id: iris['target_names'][class_id])

# ========================================

# Split the frame into one sub-frame per species:
# 0 = setosa, 1 = versicolor, 2 = virginica.
df0, df1, df2 = (df[df['target'] == class_id] for class_id in (0, 1, 2))

for species_frame in (df0, df1, df2):
    print(species_frame.head())

# =======================================

# Scatter plots of the raw data: one panel for sepals, one for petals.
fig = plt.figure('Iris Data', figsize=(14, 7))

# (sub-frame, marker colour) per species, in legend order.
species_style = ((df0, 'r'), (df1, 'y'), (df2, 'b'))
# (subplot position, x column, y column, title) per panel.
panels = (
    (121, 'sepal length (cm)', 'sepal width (cm)', 'Sepal width vs sepal length'),
    (122, 'petal length (cm)', 'petal width (cm)', 'Petal width vs petal length'),
)

for position, x_col, y_col, panel_title in panels:
    plt.subplot(position)
    for frame, colour in species_style:
        plt.scatter(frame[x_col], frame[y_col], color=colour, marker='o')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title(panel_title)
    plt.legend(['0 Setosa', '1 Versicolor', '2 Virginica'])
    plt.grid(True)
plt.show()

# ===========================================

# Split the dataset into train and test parts.
from sklearn.model_selection import train_test_split
x = df.drop(['target', 'spesies'], axis=1)  # feature columns
y = df['target']                            # class ids

# Hold out 20% of the samples for testing.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2)

print(len(x_train))     # 120 = 80%
print(len(x_test))      #  30 = 20% (result of test_size = .2)

# ============================================

# Support vector classifier.
from sklearn.svm import SVC
model = SVC(gamma='auto')   # explicit gamma avoids a warning

# Train on the training split.
model.fit(x_train, y_train)

# Accuracy on the held-out split.
print(model.score(x_test, y_test))

# ===========================================

# Predict one hand-picked sample per species.
samples = (
    [5.1, 3.5, 1.4, 0.2],    # output = [0] = setosa
    [7.0, 3.2, 4.7, 1.4],    # output = [1] = versicolor
    [5.9, 3.0, 5.1, 1.8],    # output = [2] = virginica
)
for measurements in samples:
    print(model.predict([measurements]))

# ===========================================

# plot svm

def make_meshgrid(x, y, h=.02):
    """Create a mesh of points covering the data plus a one-unit margin.

    Parameters
    ----------
    x: data to base x-axis meshgrid on
    y: data to base y-axis meshgrid on
    h: stepsize for meshgrid, optional

    Returns
    -------
    xx, yy : ndarray
    """
    x_range = np.arange(x.min() - 1, x.max() + 1, h)
    y_range = np.arange(y.min() - 1, y.max() + 1, h)
    xx, yy = np.meshgrid(x_range, y_range)
    return xx, yy

def plot_contours(ax, clf, xx, yy, **params):
    """Plot a classifier's predictions over a mesh as filled contours.

    Parameters
    ----------
    ax: matplotlib axes object
    clf: a classifier
    xx: meshgrid ndarray
    yy: meshgrid ndarray
    params: extra keyword arguments forwarded to contourf, optional
    """
    # One (x, y) row per grid point, as predict() expects.
    flat_grid = np.c_[xx.ravel(), yy.ravel()]
    class_map = clf.predict(flat_grid).reshape(xx.shape)
    return ax.contourf(xx, yy, class_map, **params)

# Reload the dataset and keep only the first two feature columns, so the
# model is fitted on a two-dimensional dataset.
iris = load_iris()
X = iris['data'][:, :2]
print(X)
y = iris['target']
print(y)

C = 1.0  # SVM regularization parameter
# C is passed explicitly so the constant above actually controls the model
# (1.0 is also SVC's default, so the fitted model is unchanged).
model = SVC(gamma='auto', C=C)
model = model.fit(X, y)

fig = plt.figure()

X0, X1 = X[:, 0], X[:, 1]
xx, yy = make_meshgrid(X0, X1)
ax = plt.subplot()

# Decision regions first, then the training points drawn on top of them.
plot_contours(ax, model, xx, yy, cmap='coolwarm', alpha=0.8)
ax.scatter(X0, X1, c=y, cmap='coolwarm', s=50, edgecolors='k')
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
ax.set_xlabel('Sepal length (cm)')
ax.set_ylabel('Sepal width (cm)')
ax.set_title('Support Vector Machine')
plt.show()

这是最终结果：

ValueError：未知标签类型（Unknown label type）——出现在 Iris（鸢尾花）数据集上

2 个答案: