距离计算:将 NaN(缺失值)视为最大可能距离

时间:2018-06-05 09:41:16

标签: python distance nan euclidean-distance cosine-similarity

我已经尽力寻找这个问题的解决方案。假设我有 2 个客户,每个客户包含如下若干属性:

cust1 = [4.0, 75.0, 2.0, 155.0, 58.0, 3.0, 7.0, 4.0, 0.0, 4.0, 0.0, 1.0, 1.0, nan, 2.0, 1.0, 3.0, 0.0, 0.0, 2.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, nan, 13.0, 8260.0, 41.5, 91.0, 65.0, 4.8, 5.7, 7.4, 4.2, 301.0, 31.0, 91.0, 10.0, 196.0, 121.0, 139.0, 4.7, 10.0, 54.0, 72.0, 1.1, 225.0, 42.0, 0.69, 104.0, nan, 59.3, 13.0, 41.5, 100.0, 31.1, 65.0, 4.8, 139.0, 4.7, 11.0, 57.0, nan, nan, nan, nan, 11.0, 1.4, 138.0, 2.7, 9.0, 2.0, 1.0, nan, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 1.0, 2.0, nan, 5.0, 5.0, 5.0, 3.0, 3.0, 2.0, 3.0, 3.0, 3.0, 1.0, 2.0, 3.0, 4.0, 2.0, 2.0, 2.0, 5.0, 5.0, 5.0, 5.0, 1.0, 2.0, 1.0, 1.0, 2.0, 2.0, 0.0, 2.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 2.0, 1.0, 2.0, 1.0, 0.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 2.0, 2.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
cust2 = [4.0, 78.0, 2.0, 160.0, 62.0, 3.0, 5.0, 3.0, 0.0, 6.0, 0.0, 1.0, 1.0, nan, 24.0, 0.0, 2.0, 0.0, 0.0, 2.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, nan, 11.4, 7620.0, 35.4, 92.0, 106.0, 5.2, 5.4, 7.2, 4.5, 206.0, 40.0, 23.0, 6.0, 169.0, 102.0, 137.0, 6.0, 9.2, 76.0, 38.0, 3.5, 927.0, 83.0, 0.66, 84.0, nan, 64.2, 11.4, 35.4, 100.0, 32.2, 106.0, 5.2, 137.0, 6.0, 10.0, 62.6, nan, nan, nan, nan, 23.0, 1.5, 138.0, 3.4, 9.0, 2.0, 1.0, nan, 2.0, 2.0, 2.0, 2.0, 1.0, 4.0, 1.0, 2.0, nan, 5.0, 5.0, 4.0, 3.0, 3.0, 1.0, 3.0, 3.0, 3.0, 2.0, 2.0, 4.0, 5.0, 2.0, 3.0, 2.0, 5.0, 5.0, 5.0, 4.0, 1.0, 3.0, 1.0, 1.0, 2.0, 2.0, 0.0, 2.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 3.0, 0.0, 3.0, 1.0, 0.0, 1.0, 3.0, 0.0, 2.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 2.0, 0.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 2.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

我想计算这些客户之间的相似度。问题是,正如您所看到的,数据包含缺失值(nan)。我希望把含有缺失值的属性按最大可能距离来处理。我在这个项目中使用 Python。我尝试了 cosine_distance 和 distance.euclidean,但它们都不能处理缺失值。

我的数据包含200多个此类客户。我的总体目标是找到5个最相似的客户。

import numpy as np
import pandas as pd
from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity
# For every missing cell of `df`, collect the values that the most similar
# customers (rows) hold in the same column.
#
# Expects `df` (one row per customer, one column per attribute) and `pd`
# (pandas) to be in scope. A cell counts as missing when it cannot be parsed
# as a number (e.g. the "?" placeholder) or is NaN.
#
# Results left in module scope:
#   simi      - pairwise cosine-similarity matrix over all rows of `df`
#   simillist - similarity row of the last customer that had a missing cell
#   hepsi     - donor values gathered for the last missing cell processed
NEIGHBOUR_COUNT = 5  # how many similar customers to consult per missing cell

simillist = []
hepsi = []

# Coerce every cell to a number; anything non-numeric becomes NaN.
dfyeni = df.apply(pd.to_numeric, errors='coerce')

# Record which cells are missing *before* they are zero-filled below, so the
# loop can tell a real 0 apart from a filled-in gap.
missing = dfyeni.isna().to_numpy()

# cosine_similarity does not accept NaN input, so gaps are replaced by 0 for
# the similarity computation only.
dfyeni = dfyeni.fillna(0)
simi = cosine_similarity(dfyeni)  # similarity for the whole dataframe

n_rows, n_cols = df.shape

# Iterate by position: `simi` is a plain array and `.iloc` is positional, so
# index/column labels must not be used here.
for rowsira in range(n_rows):
    for y in range(n_cols):
        if not missing[rowsira, y]:
            continue

        # Similarity of the customer with the gap to every other customer.
        simillist = simi[rowsira]

        # Row positions ordered from most to least similar.
        yenilist = sorted(range(len(simillist)),
                          key=lambda k: simillist[k],
                          reverse=True)

        hepsi = []
        for closeindex in yenilist:
            # Skip the customer itself explicitly instead of assuming it is
            # ranked first (ties can reorder it), and skip neighbours that
            # lack this attribute too.
            if closeindex == rowsira or missing[closeindex, y]:
                continue

            # Value of the attribute for the next most similar customer.
            closeatr = df.iloc[closeindex, y]
            hepsi.append(closeatr)

            if len(hepsi) == NEIGHBOUR_COUNT:
                break

0 个答案:

没有答案