我正在寻求算法方面的帮助,可以在本地计算机或集群上运行,语言不限(Python、R、JavaScript 等均可)。
我有一个带有坐标的位置列表。
# R script
# Build a reproducible sample table of n locations: an id, a random
# latitude/longitude and a random integer value per row.
n <- 10
set.seed(1)
index <- paste0("id_", seq_len(n))
# The three runif() calls stay in this order (lat, lon, values) so the seeded
# random stream yields the same numbers.
lat <- runif(n, min = 32.0, max = 41)
lon <- -runif(n, min = 84, max = 112)
values <- as.integer(runif(n, min = 50, max = 100))
# Column names are given directly instead of being renamed afterwards.
df <- data.frame(loc_id = index, lat = lat, lon = lon, value = values,
                 stringsAsFactors = FALSE)
loc_id lat lon value
1 id_1 34.38958 -89.76729 96
2 id_2 35.34912 -88.94359 60
3 id_3 37.15568 -103.23664 82
4 id_4 40.17387 -94.75490 56
5 id_5 33.81514 -105.55556 63
6 id_6 40.08551 -97.93558 69
7 id_7 40.50208 -104.09332 50
8 id_8 37.94718 -111.77337 69
9 id_9 37.66203 -94.64099 93
10 id_10 32.55608 -105.76847 67
我需要为表中的每个位置找到 3 个最近的位置。
这是我在R中的代码:
# R script
# Brute-force search for the 3 nearest locations of every row of df.
# Every unordered pair (i, k) is measured once and the result is recorded for
# both rows; each row keeps only its 3 smallest distances.
# NOTE(review): nothing from dplyr appears to be used in this snippet.
require(dplyr)
require(geosphere)
# Start timestamp; the code that reports the elapsed time is not shown here.
start.time <- Sys.time()
d1 <- df
# Very large sentinel distance used for the placeholder entries below.
sample <- 999999999999
# Three placeholder entries, so that a row always holds 3 entries even before
# any real distance has been recorded for it.
distances <- list("init1" = sample, "init2" = sample, "init3" = sample)
# Give every row the placeholder list; it is read back per row with
# d1$distances[[i]] inside the loops.
d1$distances <- apply(d1, 1, function(x){distances})
n_rows = nrow(d1)
# i runs over all rows but the last, k over the rows after i, so each pair of
# rows is visited exactly once.
for (i in 1:(n_rows-1)) {
# current location
# (coordinates are passed in lon, lat order)
dot1 <- c(d1$lon[i], d1$lat[i])
for (k in (i+1):n_rows) {
# next location
dot2 <- c(d1$lon[k], d1$lat[k])
# distance between locations
# (computed with distHaversine via distm and truncated by as.integer)
meters_between <- as.integer(distm(dot1, dot2, fun = distHaversine))
# updating current location distances
# Add an entry named after row k's loc_id, then sort ascending and keep the
# 3 smallest entries.
distances <- d1$distances[[i]]
distances[d1$loc_id[k]] <- meters_between
d1$distances[[i]] <- distances[order(unlist(distances), decreasing=FALSE)][1:3]
# updating next location distances
# Same update for row k, with the entry named after row i's loc_id.
distances <- d1$distances[[k]]
distances[d1$loc_id[i]] <- meters_between
d1$distances[[k]] <- distances[order(unlist(distances), decreasing=FALSE)][1:3]
}
}
但是需要太多时间:
# [1] "For 10 rows and 45 iterations takes 0.124729156494141 sec. Average sec 0.00277175903320313 per row."
# [1] "For 100 rows and 4950 iterations takes 2.54944682121277 sec. Average sec 0.000515039761861165 per row."
# [1] "For 200 rows and 19900 iterations takes 10.1178169250488 sec. Average sec 0.000508433011308986 per row."
# [1] "For 500 rows and 124750 iterations takes 73.7151870727539 sec. Average sec 0.000590903303188408 per row."
我在Python中做了同样的事情:
# Python script
import pandas as pd
import numpy as np

# Build a reproducible sample table of n locations indexed by 'loc_id'.
n = 10
np.random.seed(1)
# Not used afterwards, but this draw advances the seeded generator, so it is
# kept to leave the sampled coordinates and values unchanged.
data_m = np.random.uniform(0, 5, 5)
# The dict values are evaluated top to bottom, which fixes the order of the
# random draws: lat, then lon, then values.
data = {
    'loc_id': range(1, n+1),
    'lat': np.random.uniform(32, 41, n),
    'lon': np.random.uniform(84, 112, n)*(-1),
    'values': np.random.randint(50, 100, n),
}
df = pd.DataFrame(data, columns=['loc_id', 'lat', 'lon', 'values'])
# Turn the numeric ids 1..n into the labels 'id_1'..'id_n'.
df['loc_id'] = df['loc_id'].map('id_{0}'.format)
# The labels become the index; the default integer index is discarded.
df = df.set_index('loc_id')
from geopy.distance import distance
from datetime import datetime

# Brute-force search for the 3 nearest locations of every row of df.
# Every unordered pair (i, k) is measured once and the result is recorded for
# both rows; each row keeps only its 3 smallest distances, stored as a list of
# single-entry dicts {loc_id: meters}.
start_time = datetime.now()
# Very large sentinel distance used for the placeholder entries.
sample = 999999999999
df['distances'] = np.nan
# Each row gets its own list of three placeholders, so a row always holds
# 3 entries even before any real distance has been recorded.
df['distances'] = df['distances'].apply(lambda x: [{'init1': sample}, {'init2': sample}, {'init3': sample}])
n_rows = len(df)
rows_done = 0
for i, row_i in df.head(n_rows-1).iterrows():
    dot1 = (row_i['lat'], row_i['lon'])
    rows_done = rows_done + 1
    # Only rows after i are visited, so each pair is measured once.
    for k, row_k in df.tail(n_rows-rows_done).iterrows():
        dot2 = (row_k['lat'], row_k['lon'])
        meters_between = int(distance(dot1,dot2).meters)
        # Update row i: append the new entry, sort by the single value of
        # each dict and keep the 3 smallest.
        distances = df.at[i, 'distances']
        distances.append({k: meters_between})
        distances_sorted = sorted(distances, key=lambda x: x[next(iter(x))])[:3]
        df.at[i, 'distances'] = distances_sorted
        # Same update for row k.
        distances = df.at[k, 'distances']
        distances.append({i: meters_between})
        distances_sorted = sorted(distances, key=lambda x: x[next(iter(x))])[:3]
        df.at[k, 'distances'] = distances_sorted
# Call form works on both Python 2 and Python 3.
print(df)
几乎一样的表现。
有人知道是否有更好的方法吗?我的任务需要处理 90000 个位置。我甚至考虑过 Hadoop / MapReduce / Spark,但不知道如何在分布式模式下实现。
我很高兴听到任何想法或建议。
答案 0 :(得分:4)
如果欧几里得距离可以接受,那么 RANN 包中的 nn2 使用 kd 树和 C 代码实现,因此应该很快:
library(RANN)
# Search the lat/lon columns of df against themselves.
# NOTE(review): k = 4 is presumably used because each point is returned as
# its own nearest neighbour, leaving 3 other points -- confirm against the
# nn2 documentation.
nn2(df[c("lat", "lon")], k = 4)
在我不是特别快的笔记本电脑上,这总共花费了0.06至0.11秒,处理n = 10,000行,而对于90,000行则花费了1.00至1.25秒。
答案 1 :(得分:2)
我可以提供一个使用 Python 的 `scipy` 的解决方案。
答案 2 :(得分:0)
这是使用 C++ 和我的库 GeographicLib(1.47 版或更高版本)解决此问题的方法。它使用真正的椭球测地线距离,并利用 vantage point tree(VP 树)来优化最近邻搜索。
#include <exception>
#include <vector>
#include <fstream>
#include <string>
#include <GeographicLib/NearestNeighbor.hpp>
#include <GeographicLib/Geodesic.hpp>
using namespace std;
using namespace GeographicLib;
// Plain record for one input location: an identifier together with its
// latitude and longitude.
struct pos {
  string id;
  double lat;
  double lon;
  // Every argument has a default, so pos is also default-constructible
  // (empty id, lat = lon = 0).
  pos(const string& name = "", double latitude = 0, double longitude = 0)
    : id(name)
    , lat(latitude)
    , lon(longitude)
  {}
};
// Function object giving the distance between two positions, as reported by
// Geodesic::Inverse for the Geodesic supplied at construction.
class DistanceCalculator {
public:
  explicit DistanceCalculator(const Geodesic& geod) : _geod(geod) {}

  // Returns the distance from a to b.
  // Throws GeographicErr when the computed distance is negative or NaN.
  double operator() (const pos& a, const pos& b) const {
    double dist;
    _geod.Inverse(a.lat, a.lon, b.lat, b.lon, dist);
    // A NaN (illegal input position) fails this comparison too, so it takes
    // the error path rather than being returned.
    const bool valid = dist >= 0;
    if (valid)
      return dist;
    throw GeographicErr("distance doesn't satisfy d >= 0");
  }

private:
  Geodesic _geod;   // own copy of the Geodesic passed to the constructor
};
int main() {
try {
// Read in pts
vector<pos> pts;
string id;
double lat, lon;
{
ifstream is("pts.txt"); // lines of "id lat lon"
if (!is.good())
throw GeographicErr("pts.txt not readable");
while (is >> id >> lon >> lat)
pts.push_back(pos(id, lat, lon));
if (pts.size() == 0)
throw GeographicErr("need at least one location");
}
// Define a distance function object
DistanceCalculator distance(Geodesic::WGS84());
// Create NearestNeighbor object
NearestNeighbor<double, pos, DistanceCalculator>
ptsset(pts, distance);
vector<int> ind;
int n = 3; // Find 3 nearest neighbors
for (unsigned i = 0; i < pts.size(); ++i) {
ptsset.Search(pts, distance, pts[i], ind,
n, numeric_limits<double>::max(),
// exclude the point itself
0.0);
if (ind.size() != n)
throw GeographicErr("unexpected number of results");
cout << pts[i].id;
for (unsigned j = 0; j < ind.size(); ++j)
cout << " " << pts[ind[j]].id;
cout << "\n";
}
int setupcost, numsearches, searchcost, mincost, maxcost;
double mean, sd;
ptsset.Statistics(setupcost, numsearches, searchcost,
mincost, maxcost, mean, sd);
long long
totcost = setupcost + searchcost,
exhaustivecost = ((pts.size() - 1) * pts.size())/2;
cerr
<< "Number of distance calculations = " << totcost << "\n"
<< "With an exhaustive search = " << exhaustivecost << "\n"
<< "Ratio = " << double(totcost) / exhaustivecost << "\n"
<< "Efficiency improvement = "
<< 100 * (1 - double(totcost) / exhaustivecost) << "%\n";
}
catch (const exception& e) {
cerr << "Caught exception: " << e.what() << "\n";
return 1;
}
}
该程序从 pts.txt 读取一组地理点(每行格式为“id lat lon”),并将它们放入 VP 树中。然后对每个点查找最近的 3 个邻居,并打印该点的 id 及其邻居的 id(按距离排序)。
例如,可使用以下命令进行编译:
g++ -O3 -o nearest nearest.cpp -lGeographic
如果 pts.txt 包含 90000 个点,则在我的家用计算机上,完成约 3380000 次距离计算后,整个计算大约需要 6 秒(约每点 70 微秒)。这比蛮力计算(执行全部 N(N − 1)/2 次距离计算)的效率高约 1200 倍。
您可以通过使用粗略的距离近似(例如球面距离或欧几里得距离)来进一步提速(提升“几倍”);只需相应地修改 DistanceCalculator 类即可。例如,下面这个版本的 DistanceCalculator 返回以度为单位的球面距离:
// Function object giving the spherical (great-circle) separation between two
// positions as an angle in degrees, i.e. the result of Math::atan2d.
class DistanceCalculator {
public:
  // The Geodesic argument is ignored; it is accepted so that this class can
  // be constructed the same way as the geodesic DistanceCalculator.
  explicit DistanceCalculator(const Geodesic&) {}

  double operator() (const pos& a, const pos& b) const {
    double sinLatA, cosLatA, sinLatB, cosLatB, sinDLon, cosDLon;
    Math::sincosd(a.lat, sinLatA, cosLatA);
    Math::sincosd(b.lat, sinLatB, cosLatB);
    Math::sincosd(Math::AngDiff(a.lon, b.lon), sinDLon, cosDLon);
    // The angle is atan2(sine term, cosine term); the sine term is the
    // hypotenuse of the two components below.
    const double sinTerm1 = cosLatA * sinLatB - sinLatA * cosLatB * cosDLon;
    const double sinTerm2 = cosLatB * sinDLon;
    const double cosTerm = sinLatA * sinLatB + cosLatA * cosLatB * cosDLon;
    return Math::atan2d(Math::hypot(sinTerm1, sinTerm2), cosTerm);
  }
};
但是这样一来,您就要额外承担确保近似足够精确的责任。我建议一开始就直接使用正确的测地距离。
有关 GeographicLib 中 VP 树实现的详细信息,请参见 here。