我正在做的是:对每个地址,根据经纬度找出与它距离小于给定阈值的所有其他地址,并把它们归为一组。
我目前用的是完全蛮力的方法(每个地址都要与其余所有地址逐一比较),速度太慢,该如何改进呢?
def space_dist(c1, c2):
    """Return the straight-line (Euclidean) distance between two 2-D points.

    Args:
        c1: indexable pair; items 0 and 1 are numbers or numeric strings.
        c2: indexable pair with the same layout as ``c1``.

    Returns:
        The distance as a float, in the same unit as the inputs. For
        latitude/longitude pairs this is a flat-plane distance in degrees,
        not a great-circle distance.

    Raises:
        ValueError: if a coordinate string cannot be parsed by ``float``.
        TypeError: if a coordinate is not a number or string.
    """
    # Each coordinate is converted exactly once; hypot(dx, dy) is the
    # standard-library form of sqrt(dx*dx + dy*dy).
    delta_0 = float(c1[0]) - float(c2[0])
    delta_1 = float(c1[1]) - float(c2[1])
    return math.hypot(delta_0, delta_1)
def _cluster_by_dist(i, threshold=500):
    """Return the indices of all rows that lie close to row ``i``.

    Reads the module-level ``address_group`` (the rows) and the column
    positions ``i_x`` / ``i_y`` used to pick the two coordinates of a row.

    Args:
        i: index into ``address_group`` of the anchor row.
        threshold: search radius. It is divided by 27 and then by 3600
            before being compared with the coordinate distance --
            presumably a metres -> arc-seconds -> degrees conversion at
            roughly 27 m per arc-second; TODO confirm the intended unit.

    Returns:
        A list that starts with ``i`` and is followed, in ascending index
        order, by every other index whose distance to row ``i`` is
        strictly below the converted radius.
    """
    anchor_row = address_group[i]
    # The anchor point never changes during the scan, so build it once.
    anchor = (anchor_row[i_x], anchor_row[i_y])
    # Keep the converted radius under its own name instead of overwriting
    # the caller-facing ``threshold`` with a value in a different unit.
    radius = threshold / 27.0 / 3600.0

    # NOTE: this is a linear scan, so one call costs O(len(address_group))
    # and running it for every row is quadratic overall.
    ids = [i]
    for other_idx, other_row in enumerate(address_group):
        if other_idx == i:
            continue
        # Explicit "< radius" so that a NaN distance (unparseable or "nan"
        # coordinate) is never treated as a neighbour.
        if space_dist(anchor, (other_row[i_x], other_row[i_y])) < radius:
            ids.append(other_idx)
    return ids
def _init_worker(rows, x_col, y_col):
    """Publish the shared lookup data as module globals inside a pool worker."""
    global address_group, i_x, i_y
    address_group, i_x, i_y = rows, x_col, y_col


def cluster_check():
    """Yield, for every row that has at least one neighbour, its group of rows.

    ``_cluster_by_dist`` is mapped over every index of the module-level
    ``address_group`` through a ``Pool``; results are consumed in index
    order. After every 100 results a progress line ``"<count>: <seconds>"``
    is printed, where the seconds are measured since the previous progress
    line (or since the start for the first one).

    Yields:
        A list of rows from ``address_group`` (anchor row first) for each
        index whose id list contains more than one entry. Every row with
        neighbours produces its own group, so the same neighbourhood can be
        yielded several times with a different anchor.
    """
    # address_group / i_x / i_y are assigned under the ``__main__`` guard.
    # Workers started with the "spawn" or "forkserver" start methods do not
    # execute that guard, so the values are handed to each worker explicitly.
    # The ``with`` block guarantees the workers are shut down once the
    # generator is exhausted or closed.
    with Pool(
        initializer=_init_worker,
        initargs=(address_group, i_x, i_y),
    ) as pool:
        lap_start = time.time()
        id_lists = pool.imap(_cluster_by_dist, range(len(address_group)))
        for done, ids in enumerate(id_lists, 1):
            if done % 100 == 0:
                print("{}: {}".format(done, time.time() - lap_start))
                lap_start = time.time()
            # A list holding only the anchor index means "no neighbours".
            if len(ids) > 1:
                yield [address_group[i] for i in ids]
if __name__ == "__main__":
    # Names of the fields of one input row, in positional order.
    columns = "id,key,mode,x,y,freq".split(",")

    # These assignments are intentionally module-level: _cluster_by_dist
    # reads address_group, i_x and i_y as globals, and cluster_check reads
    # address_group. load_raw_data is defined elsewhere; its result is
    # materialised into a list because rows are looked up by index.
    address_group = list(load_raw_data("input_path", columns))
    i_key, i_x, i_y = columns.index("key"), columns.index("x"), columns.index("y")

    for addresses in cluster_check():
        for address in addresses:
            print(address)
        # Blank line between consecutive groups.
        print()