想要计算每个群集中的instagram /酒店比率。但结果显示我实际上将非常遥远的点聚集在一起。这不是DBSCAN的可能性。怎么了?
程序:使用DBSCAN对Instagram帖子进行分组,然后使用1NN对酒店进行分类。
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.neighbors import KNeighborsClassifier
if __name__ == '__main__':
inst = pd.read_csv('inst.csv', encoding='utf-8')
ht = pd.read_csv('ht.csv', encoding='utf-8')
inst = inst[(inst.lat >= 48.30) & (inst.lng >= -139.06) & (inst.lat <= 60.00) & (inst.lng <= -114.03)]
ht = ht[(ht.lat >= 48.30) & (ht.lng >= -139.06) & (ht.lat <= 60.00) & (ht.lng <= -114.03)]
# kmean = KMeans(n_clusters=50,n_jobs=-1)
# kmean.fit(inst[['lat', 'lng']])
#
# ht_labels = kmean.predict(ht[['lat', 'lng']])
# inst_labels = kmean.predict(inst[['lat', 'lng']])
#
# plt.scatter(ht.lng, ht.lat, c=ht_labels, alpha=0.5)
# plt.savefig('./fig/hotel_clusters.png')
# plt.clf()
#
# plt.scatter(inst.lng, inst.lat, c=inst_labels, alpha=0.5)
# plt.savefig('./fig/instagram_posts_clusters.png')
# plt.clf()
dbs = DBSCAN(eps=0.05,min_samples=10,metric='haversine', n_jobs=-1)
ht_labels = dbs.fit_predict(ht[['lat', 'lng']])
inst_labels = dbs.fit_predict(inst[['lat', 'lng']])
plt.scatter(ht.lng, ht.lat, c=ht_labels, alpha=0.5)
plt.savefig('./fig/hotel_clusters1.png')
plt.clf()
plt.scatter(inst.lng, inst.lat, c=inst_labels, alpha=0.5)
plt.savefig('./fig/instagram_posts_clusters1.png')
plt.clf()
knn = KNeighborsClassifier(n_neighbors=1, n_jobs=-1)
knn.fit(inst[['lat', 'lng']], inst_labels)
ht_labels = knn.predict(ht[['lat', 'lng']])
plt.scatter(ht.lng, ht.lat, c=ht_labels, alpha=0.5)
plt.savefig('./fig/hotel_clusters3.png')
plt.clf()
plt.scatter(inst.lng, inst.lat, c=inst_labels, alpha=0.5)
plt.savefig('./fig/instagram_posts_clusters3.png')
plt.clf()
ht = ht[['lat', 'lng']]
ht['lb'] = ht_labels
inst = inst[['lat', 'lng']]
inst['lb'] = inst_labels
ht1 = ht.groupby(['lb']).count().reset_index().set_index('lb')
inst1 = inst.groupby(['lb']).count().reset_index().set_index('lb')
print(ht1)
ratio = inst1/ht1
print(ratio)
clu = 2
plt.scatter(ht[ht.lb == clu].lng, ht[ht.lb == clu].lat, c='black')
print(len(ht[ht.lb == clu]))
plt.scatter(ht.lng, ht.lat, c=ht_labels, alpha=0.05)
plt.savefig('./fig/hotel_clusters4.png')
plt.clf()
我找不到这个bug,你能帮忙吗?
答案 0 :(得分:1)
Sklearn havrsine期望坐标以弧度表示,而不是度数。
因此我认为mod 2pi头点很接近。