Answer 0: (Score: 7)
I found @Nipun Wijerathne's answer incomplete and a bit messy, so I decided to provide an MCVE (an actual MCVE :D) for future readers, but first let me lay out some general guidelines:

As already mentioned, the Euclidean metric fails to find the correct distance because it only measures the ordinary straight-line distance. So in a multi-dimensional space of variables, two points may appear to be the same distance from the mean, while one of them actually lies far away from the data cloud (i.e. it is an extreme value).

The solution is the Mahalanobis distance, which does something similar to feature scaling by working along the eigenvectors of the variables instead of the original axes.
It applies the following formula:

MD(x) = sqrt((x - m)^T * S^(-1) * (x - m))

where:

x is the observation whose distance we want to find,
m is the mean of the observations,
S is the covariance matrix.

Refresher: the covariance represents the direction of the relationship between two variables (i.e. positive, negative or zero), so it shows how strongly one variable is related to changes in the others.
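To make the point about Euclidean distance concrete, here is a small sketch (not part of the original answer, numpy only, with made-up data): two points that are equally far from the mean in Euclidean terms, but not in Mahalanobis terms, because the data cloud is stretched along one axis.

import numpy as np

rng = np.random.default_rng(42)
data = rng.normal(size=(500, 2)) * [10.0, 1.0]   # wide along x, narrow along y
m = data.mean(axis=0)
S_inv = np.linalg.inv(np.cov(data, rowvar=False))

def mahalanobis(x, m, S_inv):
    d = x - m
    return np.sqrt(d.dot(S_inv).dot(d))

p1 = m + np.array([5.0, 0.0])   # 5 units away along the wide axis
p2 = m + np.array([0.0, 5.0])   # 5 units away along the narrow axis -> extreme value

print(np.linalg.norm(p1 - m), np.linalg.norm(p2 - m))        # both 5.0 (Euclidean)
print(mahalanobis(p1, m, S_inv), mahalanobis(p2, m, S_inv))  # roughly 0.5 vs 5 (Mahalanobis)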
Consider the following 6x3 dataset example, in which each row represents an input/example and each column represents a feature of that example:
First, we need to create the covariance matrix of the features of each input; that is why we set the parameter rowvar to False in the numpy.cov function, so that each column now represents a variable:

covariance_matrix = np.cov(data, rowvar=False)
# data here looks similar to the above 2D / Matrix example in the pictures

Then we find the inverse of the covariance matrix:

inv_covariance_matrix = np.linalg.inv(covariance_matrix)

But before proceeding, we should check, as mentioned above, that the matrix and its inverse are symmetric and positive definite. We check this with the Cholesky decomposition, which fortunately is already implemented in numpy.linalg.cholesky:

def is_pos_def(A):
    if np.allclose(A, A.T):
        try:
            np.linalg.cholesky(A)
            return True
        except np.linalg.LinAlgError:
            return False
    else:
        return False
Next, we find the mean m of the variables on each feature (shall I say dimension) and save it in an array like this:

vars_mean = []
for i in range(data.shape[0]):
    vars_mean.append(list(data.mean(axis=0)))  # axis=0 means each column in the 2D array

Note that I repeated each row only to take advantage of matrix subtraction, as shown next.

Next, we find x - m (i.e. the difference), but since we have already vectorized vars_mean, all we need to do is:

diff = data - vars_mean
# here we subtract the mean of each feature from that feature of each example
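Side note (a small alternative sketch, not in the original answer, relying only on numpy broadcasting): the row-by-row vars_mean list is not strictly necessary, because numpy broadcasts a 1-D mean vector across all rows automatically:

# equivalent to the two steps above: broadcasting subtracts the per-feature
# mean from every example without building vars_mean explicitly
diff = data - data.mean(axis=0)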
Finally, apply the formula like this:

md = []
for i in range(len(diff)):
    md.append(np.sqrt(diff[i].dot(inv_covariance_matrix).dot(diff[i])))

Note the following:
- The dimension of the inverse covariance matrix is number_of_features x number_of_features.
- The dimension of the diff matrix is the same as the original data matrix: number_of_examples x number_of_features.
- Each diff[i] (i.e. each row) is therefore 1 x number_of_features.
- The result of diff[i].dot(inv_covariance_matrix) is thus 1 x number_of_features, and when we multiply it by diff[i] again, numpy automatically treats the latter as a column matrix, i.e. number_of_features x 1, so the final result becomes a single value! (i.e. no transpose needed)

To detect outliers, we should specify a threshold; we do so by taking the mean of the Mahalanobis distance results plus or minus an extremeness degree k, where k = 2.0 * std for extreme values and 3.0 * std for very extreme values, following the 68-95-99.7 rule (see the illustration image in the same link).

Putting it all together in Python:
import numpy as np


def create_data(examples=50, features=5, upper_bound=10, outliers_fraction=0.1, extreme=False):
    '''
    This method for testing (i.e. to generate a 2D array of data)
    '''
    data = []
    magnitude = 4 if extreme else 3
    for i in range(examples):
        if (examples - i) <= round((float(examples) * outliers_fraction)):
            data.append(np.random.poisson(upper_bound ** magnitude, features).tolist())
        else:
            data.append(np.random.poisson(upper_bound, features).tolist())
    return np.array(data)


def MahalanobisDist(data, verbose=False):
    covariance_matrix = np.cov(data, rowvar=False)
    if is_pos_def(covariance_matrix):
        inv_covariance_matrix = np.linalg.inv(covariance_matrix)
        if is_pos_def(inv_covariance_matrix):
            vars_mean = []
            for i in range(data.shape[0]):
                vars_mean.append(list(data.mean(axis=0)))
            diff = data - vars_mean
            md = []
            for i in range(len(diff)):
                md.append(np.sqrt(diff[i].dot(inv_covariance_matrix).dot(diff[i])))

            if verbose:
                print("Covariance Matrix:\n {}\n".format(covariance_matrix))
                print("Inverse of Covariance Matrix:\n {}\n".format(inv_covariance_matrix))
                print("Variables Mean Vector:\n {}\n".format(vars_mean))
                print("Variables - Variables Mean Vector:\n {}\n".format(diff))
                print("Mahalanobis Distance:\n {}\n".format(md))
            return md
        else:
            print("Error: Inverse of Covariance Matrix is not positive definite!")
    else:
        print("Error: Covariance Matrix is not positive definite!")


def MD_detectOutliers(data, extreme=False, verbose=False):
    MD = MahalanobisDist(data, verbose)
    # one popular way to specify the threshold
    # m = np.mean(MD)
    # t = 3. * m if extreme else 2. * m
    # outliers = []
    # for i in range(len(MD)):
    #     if MD[i] > t:
    #         outliers.append(i)  # index of the outlier
    # return np.array(outliers)

    # or according to the 68-95-99.7 rule
    std = np.std(MD)
    k = 3. * std if extreme else 2. * std
    m = np.mean(MD)
    up_t = m + k
    low_t = m - k
    outliers = []
    for i in range(len(MD)):
        if (MD[i] >= up_t) or (MD[i] <= low_t):
            outliers.append(i)  # index of the outlier
    return np.array(outliers)


def is_pos_def(A):
    if np.allclose(A, A.T):
        try:
            np.linalg.cholesky(A)
            return True
        except np.linalg.LinAlgError:
            return False
    else:
        return False


data = create_data(15, 3, 10, 0.1)
print("data:\n {}\n".format(data))

outliers_indices = MD_detectOutliers(data, verbose=True)

print("Outliers Indices: {}\n".format(outliers_indices))
print("Outliers:")
for ii in outliers_indices:
    print(data[ii])
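If SciPy is available, the step-by-step result can be cross-checked against scipy.spatial.distance.mahalanobis (a minimal sanity-check sketch, not part of the original answer; it reuses create_data and MahalanobisDist from above and assumes the generated covariance matrix passes the positive-definite check):

from scipy.spatial.distance import mahalanobis  # assumes SciPy is installed

data = create_data(15, 3, 10, 0.1)
S_inv = np.linalg.inv(np.cov(data, rowvar=False))
m = data.mean(axis=0)

md_scipy = [mahalanobis(row, m, S_inv) for row in data]  # per-example distances
print(np.allclose(md_scipy, MahalanobisDist(data)))       # True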
Answer 1: (Score: 5)

In multivariate data, Euclidean distance fails if there is covariance between the variables (i.e. between X, Y and Z in your case).

Therefore, what the Mahalanobis distance does is the following (a whitening sketch after the code below makes these steps concrete):

1. It transforms the variables into an uncorrelated space.
2. It makes the variance of each variable equal to 1.
3. Then it calculates the plain Euclidean distance.

We can calculate the Mahalanobis distance for each data sample as follows:

MD(x) = sqrt((x - mean)^T * S^(-1) * (x - mean)), where S is the covariance matrix.

Here I have provided the Python code and added comments so that you can understand it.
import numpy as np

data = np.matrix([[1, 2, 3, 4, 5, 6, 7, 8],
                  [1, 4, 9, 16, 25, 36, 49, 64],
                  [1, 4, 9, 16, 25, 16, 49, 64]])

def MahalanobisDist(data):
    covariance_xyz = np.cov(data)  # calculate the covariance matrix (each row is a variable)
    inv_covariance_xyz = np.linalg.inv(covariance_xyz)  # take the inverse of the covariance matrix
    xyz_mean = np.mean(data[0]), np.mean(data[1]), np.mean(data[2])
    # unpack the three variables (rows) as 1-D arrays
    x = np.array(data[0]).flatten()
    y = np.array(data[1]).flatten()
    z = np.array(data[2]).flatten()
    x_diff = np.array([x_i - xyz_mean[0] for x_i in x])  # difference between each X sample and the mean of X
    y_diff = np.array([y_i - xyz_mean[1] for y_i in y])  # difference between each Y sample and the mean of Y
    z_diff = np.array([z_i - xyz_mean[2] for z_i in z])  # difference between each Z sample and the mean of Z
    diff_xyz = np.transpose([x_diff, y_diff, z_diff])

    md = []
    for i in range(len(diff_xyz)):
        # calculate the Mahalanobis distance for each data sample
        md.append(np.sqrt(np.dot(np.dot(np.transpose(diff_xyz[i]), inv_covariance_xyz), diff_xyz[i])))
    return md

def MD_removeOutliers(data):
    MD = MahalanobisDist(data)
    threshold = np.mean(MD) * 1.5  # adjust 1.5 accordingly
    outliers = []
    for i in range(len(MD)):
        if MD[i] > threshold:
            outliers.append(i)  # index of the outlier
    return np.array(outliers)

print(MD_removeOutliers(data))
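The whitening sketch mentioned above (not part of the original answer, numpy only, with made-up data): whitening the centred data with the Cholesky factor of the covariance matrix decorrelates the variables and scales them to unit variance, after which the plain Euclidean norm equals the Mahalanobis distance.

import numpy as np

rng = np.random.default_rng(1)
X = rng.multivariate_normal(mean=[0, 0, 0],
                            cov=[[4, 2, 0], [2, 3, 1], [0, 1, 2]],
                            size=200)           # correlated 3-D data
mu = X.mean(axis=0)
S = np.cov(X, rowvar=False)

# steps 1 + 2: decorrelate and scale to unit variance (whitening with the Cholesky factor)
L = np.linalg.cholesky(S)
whitened = np.linalg.solve(L, (X - mu).T).T      # solve L * w = (x - mu) for each sample

# step 3: plain Euclidean distance in the whitened space ...
euclidean_after_whitening = np.linalg.norm(whitened, axis=1)

# ... equals the Mahalanobis distance computed directly from the formula
S_inv = np.linalg.inv(S)
mahalanobis_direct = np.sqrt(np.einsum('ij,jk,ik->i', X - mu, S_inv, X - mu))

print(np.allclose(euclidean_after_whitening, mahalanobis_direct))  # True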
Hope this helps.