I am trying to implement the spatio-temporal DBSCAN (ST-DBSCAN) clustering algorithm as a custom scikit-learn estimator. I have a dataset and want to find out how well ST-DBSCAN clusters its data points. The dataset has an attribute 'cid', which is the ground-truth label (cluster ID) for each data point. The idea is therefore to match each data point's predicted label against its ground-truth label and thereby assign an accuracy score to the ST-DBSCAN clustering.
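As a toy sketch of that idea (the label vectors below are made up), 'accuracy_score' simply compares two equal-length label vectors element-wise, so the raw cluster IDs are treated as directly comparable to 'cid':

import numpy as np
from sklearn.metrics import accuracy_score

# Hypothetical ground-truth cluster IDs ('cid') and predicted labels-
y_true = np.array([0, 0, 1, 1, 2])
y_pred = np.array([0, 0, 1, 2, 2])

# 4 out of 5 labels match-
print(accuracy_score(y_true, y_pred))
# 0.8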
My code is as follows:
from sklearn.base import BaseEstimator, ClusterMixin
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
from sklearn.metrics import accuracy_score, precision_score, recall_score
import time
from sklearn.model_selection import RandomizedSearchCV
class ST_DBSCAN(BaseEstimator, ClusterMixin):
    """
    Spatio-Temporal DBSCAN algorithm for scikit-learn compatibility

    # eps1 - spatial neighborhood
    # eps2 - temporal neighborhood
    # minPts - the minimum number of points satisfying both neighborhoods

    All estimators must have 'get_params()' and 'set_params()' functions. They are inherited
    when you subclass 'BaseEstimator', and the recommendation is not to override these functions
    (just do not state them in the definition of your classifier).
    """

    def __init__(self, eps1 = 0.5, eps2 = 10, minPts = 5):
        '''
        All arguments must have default values, so it is possible to
        initialize the clustering object without any parameters.
        Do not take data as an argument here! It belongs in the fit method.
        Parameters should have the same names as attributes.
        '''
        self.eps1 = eps1
        self.eps2 = eps2
        self.minPts = minPts
        self.predicted_labels = None

    def compute_squared_EDM(self, X):
        # Compute the pairwise Euclidean distance matrix of X-
        return squareform(pdist(X, metric='euclidean'))

    def fit(self, X, y):
        '''
        Here, you should implement all the hard work. First, you should check the parameters.
        Second, you should take and process the data. You will almost surely want to add some
        new attributes to your object which are created in the fit() method. These should end
        with an underscore, e.g. self.fitted_.
        Finally, you should return 'self', again for compatibility with the common
        scikit-learn interface.
        '''
        # Get the rows and columns of the data (a total of 'n' data points)
        # n, m = self.data.shape
        self.n_, self.m_ = X.shape

        # Compute the temporal distance matrix (the first column of X is time)
        self.timeDisMat_ = self.compute_squared_EDM(X[:, 0].reshape(self.n_, 1))

        # Compute the spatial distance matrix (the remaining columns of X)-
        self.disMat_ = self.compute_squared_EDM(X[:, 1:])

        # Mark the entries within both neighborhoods as 1 and the rest as 0, sum each row,
        # and take the indices of rows with at least minPts neighbors as the core points.
        # Note: two uses of np.where (search and replace functionality)
        self.core_points_index_ = np.where(np.sum(np.where((self.disMat_ <= self.eps1) &
                                                           (self.timeDisMat_ <= self.eps2), 1, 0), axis=1) >= self.minPts)[0]

        # Initialize the categories; -1 means unclassified-
        self.labels_ = np.full((self.n_,), -1)
        self.clusterId_ = 0

        # Iterate over all core points-
        for pointId in self.core_points_index_:
            # If the core point is not yet classified, use it as a seed point and start
            # growing the corresponding cluster
            if (self.labels_[pointId] == -1):
                # Mark pointId with the current cluster ID (i.e. as processed)
                self.labels_[pointId] = self.clusterId_

                # Find the unclassified points in the eps neighborhood of the seed point
                # and put them into the seed set-
                self.neighbour_ = np.where((self.disMat_[:, pointId] <= self.eps1) &
                                           (self.timeDisMat_[:, pointId] <= self.eps2) & (self.labels_ == -1))[0]
                self.seeds_ = set(self.neighbour_)

                # Grow from the seed points, adding density-reachable data points, until
                # the seed set is empty; at that point one cluster has been found
                while len(self.seeds_) > 0:
                    # Pop a new seed point-
                    newPoint = self.seeds_.pop()

                    # Mark newPoint with the current cluster ID-
                    self.labels_[newPoint] = self.clusterId_

                    # Find the eps neighborhood of newPoint (including newPoint itself)
                    self.queryResults_ = set(np.where((self.disMat_[:, newPoint] <= self.eps1) &
                                                      (self.timeDisMat_[:, newPoint] <= self.eps2))[0])

                    # If newPoint is itself a core point, it can be expanded, i.e. other
                    # points are density-reachable through newPoint
                    if len(self.queryResults_) >= self.minPts:
                        # Push the unclassified points in the neighborhood into the seed set
                        for resultPoint in self.queryResults_:
                            if self.labels_[resultPoint] == -1:
                                self.seeds_.add(resultPoint)

                # After the cluster has finished growing, move on to the next cluster ID
                self.clusterId_ = self.clusterId_ + 1

        self.predicted_labels = self.labels_
        # return self.labels_
        return self

    def score(self, X, y):
        return accuracy_score(y, self.predicted_labels)

    def get_labels(self):
        return self.predicted_labels
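As the class docstring says, 'get_params()' and 'set_params()' are inherited from 'BaseEstimator'. A minimal sanity check, assuming the class compiles as above:

# 'get_params()' / 'set_params()' come for free from 'BaseEstimator'-
stdb_check = ST_DBSCAN()
print(stdb_check.get_params())
# {'eps1': 0.5, 'eps2': 10, 'minPts': 5}

stdb_check.set_params(eps1 = 0.2)
print(stdb_check.get_params()['eps1'])
# 0.2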
The sample dataset I have has the following attributes - 'frame' (for time), 'x' and 'y' (for the spatial aspect), and 'cid' (the ground truth for each data point). The code is:
# Read in CSV file-
data = pd.read_csv("Clustering_Ground_Truth_Data.csv")
# Take the first 1001 rows ('.loc' slicing is end-inclusive)-
data_mod = data.loc[:1000, ['frame', 'x', 'y']]
data_mod.shape
# (1001, 3)
# Get numpy values instead of Pandas DataFrame-
data_mod = data_mod.values
X = data.loc[:1000, ['frame', 'x', 'y']]
y = data.loc[:1000, 'cid']
X = X.values
y = y.values
# Get shapes-
X.shape, y.shape
# ((1001, 3), (1001,))
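For anyone without 'Clustering_Ground_Truth_Data.csv', a hypothetical stand-in with the same columns can replace the 'pd.read_csv()' line above (the column names match the description above; all values are made up):

# Hypothetical stand-in for the CSV file; only the schema matters-
rng = np.random.default_rng(42)
data = pd.DataFrame({
    'frame': np.repeat(np.arange(101), 10),   # time attribute
    'x': rng.uniform(0.0, 1.0, size=1010),    # spatial attributes
    'y': rng.uniform(0.0, 1.0, size=1010),
    'cid': rng.integers(0, 5, size=1010)      # ground-truth cluster IDs
})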
To use the clustering code above -
# Initialize an instance of 'ST_DBSCAN' class-
stdb = ST_DBSCAN(0.1, 60, 5)
# Perform ST-DBSCAN clustering and return labels for each
# data point-
stdb.fit(X, y)
# labels = stdb.fit(X, y)
labels = stdb.get_labels()
# Cluster labels. Noisy samples are given the label -1
# Get elements and their counts-
unique, counts = np.unique(labels, return_counts=True)
# Create a dictionary such that-
# element: count
element_count = dict(zip(unique, counts))
print("\nPredictions of data points using ST-DBSCAN, element_count:\n{0}\n\n".format(element_count))
print("\nAccuracy score: {0:.4f}\n".format(stdb.score(X, y)))
This code runs fine. However, using 'RandomizedSearchCV' for a hyperparameter search produces an error:
# RandomizedSearchCV parameters-
random_params = {
    'eps1': [0.1, 0.01, 0.001],
    'eps2': [x for x in range(40, 101, 5)]
}

rf_st_dbscan = RandomizedSearchCV(
    estimator = stdb,
    param_distributions = random_params
    # scoring = 'accuracy'
)
# This line gives error-
rf_st_dbscan.fit(X, y)
The error produced is as follows:
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-15-90f41752b564> in <module>
----> 1 rf_st_dbscan.fit(X, y)

~/.local/lib/python3.8/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
    708                     return results
    709
--> 710             self._run_search(evaluate_candidates)
    711
    712         # For multi-metric evaluation, store the best_index_, best_params_ and

~/.local/lib/python3.8/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
   1480     def _run_search(self, evaluate_candidates):
   1481         """Search n_iter candidates from param_distributions"""
-> 1482         evaluate_candidates(ParameterSampler(
   1483             self.param_distributions, self.n_iter,
   1484             random_state=self.random_state))

~/.local/lib/python3.8/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params)
    680                                n_splits, n_candidates, n_candidates * n_splits))
    681
--> 682                 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
    683                                                        X, y,
    684                                                        train=train, test=test,

~/.local/lib/python3.8/site-packages/joblib/parallel.py in __call__(self, iterable)
   1002             # remaining jobs.
   1003             self._iterating = False
-> 1004             if self.dispatch_one_batch(iterator):
   1005                 self._iterating = self._original_iterator is not None
   1006

~/.local/lib/python3.8/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
    833                 return False
    834             else:
--> 835                 self._dispatch(tasks)
    836                 return True
    837

~/.local/lib/python3.8/site-packages/joblib/parallel.py in _dispatch(self, batch)
    752         with self._lock:
    753             job_idx = len(self._jobs)
--> 754             job = self._backend.apply_async(batch, callback=cb)
    755             # A job can complete so quickly than its callback is
    756             # called before we get here, causing self._jobs to

~/.local/lib/python3.8/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
    207     def apply_async(self, func, callback=None):
    208         """Schedule a func to be run"""
--> 209         result = ImmediateResult(func)
    210         if callback:
    211             callback(result)

~/.local/lib/python3.8/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
    588         # Don't delay the application, to avoid keeping the input
    589         # arguments in memory
--> 590         self.results = batch()
    591
    592     def get(self):

~/.local/lib/python3.8/site-packages/joblib/parallel.py in __call__(self)
    253         # change the default number of processes to -1
    254         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 255             return [func(*args, **kwargs)
    256                     for func, args, kwargs in self.items]
    257

~/.local/lib/python3.8/site-packages/joblib/parallel.py in <listcomp>(.0)
    253         # change the default number of processes to -1
    254         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 255             return [func(*args, **kwargs)
    256                     for func, args, kwargs in self.items]
    257

~/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
    542     else:
    543         fit_time = time.time() - start_time
--> 544         test_scores = _score(estimator, X_test, y_test, scorer)
    545         score_time = time.time() - start_time - fit_time
    546         if return_train_score:

~/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py in _score(estimator, X_test, y_test, scorer)
    589         scores = scorer(estimator, X_test)
    590     else:
--> 591         scores = scorer(estimator, X_test, y_test)
    592
    593     error_msg = ("scoring must return a number, got %s (%s) "

~/.local/lib/python3.8/site-packages/sklearn/metrics/_scorer.py in __call__(self, estimator, *args, **kwargs)
     87                                       *args, **kwargs)
     88             else:
---> 89                 score = scorer(estimator, *args, **kwargs)
     90             scores[name] = score
     91         return scores

~/.local/lib/python3.8/site-packages/sklearn/metrics/_scorer.py in _passthrough_scorer(estimator, *args, **kwargs)
    369 def _passthrough_scorer(estimator, *args, **kwargs):
    370     """Function that wraps estimator.score"""
--> 371     return estimator.score(*args, **kwargs)
    372
    373

~/University_of_Konstanz/Hiwi/Unsupervised_Learning_Works/Spatio-temporal-Clustering-master/Custom_Estimator_scikit_learn_Tutorials.py in score(self, X, y)
    134
    135     def score(self, X, y):
--> 136         return accuracy_score(y, self.predicted_labels)
    137
    138

~/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py in accuracy_score(y_true, y_pred, normalize, sample_weight)
    183
    184     # Compute accuracy for each possible representation
--> 185     y_type, y_true, y_pred = _check_targets(y_true, y_pred)
    186     check_consistent_length(y_true, y_pred, sample_weight)
    187     if y_type.startswith('multilabel'):

~/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py in _check_targets(y_true, y_pred)
     78     y_pred : array or indicator matrix
     79     """
---> 80     check_consistent_length(y_true, y_pred)
     81     type_true = type_of_target(y_true)
     82     type_pred = type_of_target(y_pred)

~/.local/lib/python3.8/site-packages/sklearn/utils/validation.py in check_consistent_length(*arrays)
    209     uniques = np.unique(lengths)
    210     if len(uniques) > 1:
--> 211         raise ValueError("Found input variables with inconsistent numbers of"
    212                          " samples: %r" % [int(l) for l in lengths])
    213

ValueError: Found input variables with inconsistent numbers of samples: [201, 800]
Why am I getting this 'ValueError'?