我有一个带有位置名称的经纬度列表,还有另一个只有经纬度的列表。我需要把第二个列表中的每个点映射到第一个列表中最接近的位置。我尝试在R中使用geosphere,但数据量太大,最终收到一条错误消息,提示“无法分配大小为718.5 GB的向量”!有什么想法吗?待映射的数据非常庞大(接近100M行,分为48段),需要映射到一个约80k条记录的经纬度列表。
答案 0 :(得分:1)
沿用 Roman Luštrik 的思路,把数据拆分成尽可能小的块是最理想的解决方案。我们先逐行查找最接近的点,而不是尝试一次把所有数据加载到内存中。本示例是一个基于SAS的解决方案。
本示例也可以通过遍历哈希表更高效地完成,但在这里解释起来会更复杂。它同样可以并行化。下面的方法效率中等,但更容易理解。为此,我们使用两个示例数据集:
1. Mobile_Activity_3months_scrambled.csv - http://js.cit.datalens.api.here.com/datasets/starter_pack/Mobile_activity_3months_scrambled.csv
50万行。我们把它当作您的大型数据集。
2. sashelp.zipcode
41k行。我们把它当作您的小型数据集。
目标:将每个数据点映射到最近的城市。
为了让这个过程尽可能简单,我们先只读取一行,并将其与最近的城市匹配。首先,读入您的数据:
/* Read the mobile-activity CSV into the dataset bigdata.
   REPLACE overwrites bigdata if it already exists.
   Edit the path below so it points at the downloaded file. */
proc import
    datafile='CHANGE DIRECTORY HERE\Mobile_activity_3months_scrambled.csv'
    dbms=csv
    out=bigdata
    replace;
run;
接下来,我们读入一行,并计算它与小型数据集中每一对经纬度之间的地理距离。我们用SQL对这两份数据做笛卡尔积。
/* Pair the first observation of bigdata with every sashelp.zipcode row that
   has a non-missing x value, compute the distance for each pair, and store
   the pairs sorted by ascending distance in nearest_point.
   GEODIST is called without an options argument, so its defaults apply
   (see the SAS GEODIST documentation for the units). */
proc sql noprint;
    create table nearest_point as
        select
            geodist(pt.lat, pt.lon, zip.y, zip.x) as Distance,
            zip.city                              as Nearest_City
        from
            bigdata(firstobs=1 obs=1) as pt
            cross join
            sashelp.zipcode as zip
        where
            zip.x is not missing
        order by
            Distance;
quit;
输出数据集中的第一个观测值就是距离最近的城市。
现在把它推广到多个观测值。我们先对其中的10个观测值执行此操作,同时稍微提高效率。我们不需要输出全部41k个观测值,只需要输出距离最小的那个观测值并将其追加到主表即可。为此,在SQL中添加outobs=1
选项。
/*
 * nearest_distance: for each of the first &n observations of bigdata, find
 * the sashelp.zipcode city with the smallest GEODIST value and collect the
 * results in all_nearest_points.
 *
 * Parameters:
 *   n = number of leading bigdata observations to process (default 10).
 *
 * Output:
 *   all_nearest_points - rebuilt on every call; one row per processed
 *                        observation, in input order, with the columns
 *                        Distance and Nearest_City.
 *   nearest_point      - scratch table holding the result for the last row.
 */
%macro nearest_distance(n=10);
    /* Keep the loop counter local so a macro variable named i that exists
       outside this macro is not overwritten. */
    %local i;

    /* PROC APPEND only ever adds rows, so drop the output of an earlier run;
       otherwise every rerun would duplicate the results. */
    %if %sysfunc(exist(all_nearest_points)) %then %do;
        proc delete data=all_nearest_points;
        run;
    %end;

    %do i = 1 %to &n.;
        /* firstobs=&i obs=&i selects exactly the i-th observation of bigdata.
           OUTOBS=1 keeps only the first row of the sorted result, i.e. the
           pair with the smallest distance. */
        proc sql outobs=1 noprint;
            create table nearest_point as
                select geodist(t1.lat, t1.lon, t2.y, t2.x) as Distance
                     , t2.city as Nearest_City
                from bigdata(obs=&i. firstobs=&i.) as t1
                cross join
                     sashelp.zipcode as t2
                where not missing(t2.x)
                order by Distance
            ;
        quit;

        /* Add this observation's nearest city to the running result table. */
        proc append base=all_nearest_points
                    data=nearest_point
                    force;
        run;
    %end;
%mend nearest_distance;

%nearest_distance;
让我们进一步推广,并关闭日志输出以加快速度。我们还可以把邮政编码数据预加载到内存中,然后处理所有观测值。为了测试该示例,我们先把bigdata
限制为最多100个观测值。
/* Shrink bigdata in place to at most its first 100 observations so the
   full-table macro can be tried out quickly. */
data bigdata;
    set bigdata;
    /* Rows 1-100 are written by the implicit output; stop before writing row 101. */
    if _n_ > 100 then stop;
run;
/*
 * nearest_distance: for every observation of bigdata, find the
 * sashelp.zipcode city with the smallest GEODIST value and collect the
 * results in all_nearest_points.
 *
 * While the loop runs, the SAS log is redirected to _tmp_.txt in the WORK
 * directory, and sashelp.zipcode is held in memory with SASFILE so it is not
 * re-read from disk for every observation.
 *
 * Output:
 *   all_nearest_points - rebuilt on every call; one row per bigdata
 *                        observation, in input order, with the columns
 *                        Distance and Nearest_City.
 *   nearest_point      - scratch table holding the result for the last row.
 */
%macro nearest_distance;
    /* Keep all working variables local to this macro. */
    %local dsid n rc i;

    /* Number of (non-deleted) observations in bigdata. */
    %let dsid = %sysfunc(open(bigdata) );
    %if &dsid. = 0 %then %do;
        %put ERROR: nearest_distance could not open bigdata.;
        %return;
    %end;
    %let n  = %sysfunc(attrn(&dsid., nlobs) );
    %let rc = %sysfunc(close(&dsid.) );

    /* PROC APPEND only ever adds rows, so drop the output of an earlier run;
       otherwise every rerun would duplicate the results. */
    %if %sysfunc(exist(all_nearest_points)) %then %do;
        proc delete data=all_nearest_points;
        run;
    %end;

    /* Load the lookup table into memory for the duration of the loop. */
    sasfile sashelp.zipcode load;

    /* Send the log to a scratch file; one PROC SQL plus one PROC APPEND per
       observation would otherwise flood the session log.
       NOTE(review): the backslash path separator assumes a Windows host. */
    proc printto log="%sysfunc(getoption(work) )\_tmp_.txt";
    run;

    %do i = 1 %to &n.;
        /* firstobs=&i obs=&i selects exactly the i-th observation of bigdata.
           OUTOBS=1 keeps only the first row of the sorted result, i.e. the
           pair with the smallest distance. */
        proc sql outobs=1 noprint;
            create table nearest_point as
                select geodist(t1.lat, t1.lon, t2.y, t2.x) as Distance
                     , t2.city as Nearest_City
                from bigdata(obs=&i. firstobs=&i.) as t1
                cross join
                     sashelp.zipcode as t2
                where not missing(t2.x)
                order by Distance
            ;
        quit;

        /* Add this observation's nearest city to the running result table. */
        proc append base=all_nearest_points
                    data=nearest_point
                    force;
        run;
    %end;

    /* Restore the default log destination. */
    proc printto log=log;
    run;

    /* Release the in-memory copy of the lookup table. */
    sasfile sashelp.zipcode close;
%mend nearest_distance;

%nearest_distance;
接下来,让我们将其并行化,并处理全部数据。您可以使用threads
参数来更改要使用的并行会话数量。
/*
 * nearest_distance: parallel version. Splits bigdata into &threads
 * contiguous ranges of observations, processes each range in its own
 * SAS/CONNECT worker session (worker1 .. worker&threads), and concatenates
 * the per-worker results, in range order, into work.all_nearest_points.
 *
 * Parameters:
 *   threads = number of parallel worker sessions (default 5). It is reduced
 *             to the number of observations in bigdata when that is smaller,
 *             so that every worker has at least one observation.
 *
 * Output:
 *   work.all_nearest_points - rebuilt on every call; one row per bigdata
 *                             observation with the columns Distance and
 *                             Nearest_City.
 *
 * While the macro runs, the primary session log is redirected to _tmp_.txt
 * in the WORK directory.
 */
%macro nearest_distance(threads=5);
    /* Keep all working variables local to this macro. */
    %local workdir dsid n rc t firstobs obs;

    /* Parallel submit options */
    options
        autosignon=yes
        sascmd='!sascmd'
    ;

    /* Current session work directory; workers write their results here */
    %let workdir = %sysfunc(getoption(work) );

    /* Total obs in big data */
    %let dsid = %sysfunc(open(bigdata) );
    %if &dsid. = 0 %then %do;
        %put ERROR: nearest_distance could not open bigdata.;
        %return;
    %end;
    %let n  = %sysfunc(attrn(&dsid., nlobs) );
    %let rc = %sysfunc(close(&dsid.) );

    /* Nothing to map: leave existing output untouched */
    %if &n. < 1 %then %do;
        %put NOTE: nearest_distance found no observations in bigdata.;
        %return;
    %end;

    /* A worker with an empty range would never create its result table and
       the final append would fail, so never use more workers than rows. */
    %if &threads. > &n. %then %let threads = &n.;

    /* Prevent writing to session log.
       NOTE(review): the backslash path separator assumes a Windows host. */
    proc printto log="%sysfunc(getoption(work) )\_tmp_.txt";
    run;

    /* Remove per-worker tables left behind by an aborted earlier run, so
       PROC APPEND in the workers starts from empty tables. */
    proc datasets lib=work nolist nowarn;
        delete _all_nearest_points_:;
    quit;

    /* Run in &threads parallel sessions */
    %do t = 1 %to &threads.;

        /* Divide up observations for each thread:
           worker t gets floor(n*(t-1)/threads)+1 .. floor(n*t/threads) */
        %let firstobs = %sysevalf(&n-(&n/&threads.)*(&threads.-&t+1)+1, floor);
        %let obs      = %sysevalf(&n-(&n/&threads.)*(&threads.-&t.), floor);

        /* The session must exist before %SYSLPUT can target it; AUTOSIGNON
           is documented for remote submits, so sign on explicitly. */
        signon worker&t.;

        /* Transfer primary session macro variables to each worker session */
        %syslput _USER_ / remote=worker&t.;

        /* Asynchronous submit: control returns immediately so the next
           worker can be started. */
        rsubmit wait=no remote=worker&t.;

            /* We are in a specific session, and must define this as a macro within the session */
            %macro thread_loop;
                %local i;

                /* Primary session library; assigned once, not per observation */
                libname workdir "&workdir.";

                /* Load the lookup table into this worker's memory. A SASFILE
                   load in the primary session would not help, because the
                   queries run in the worker sessions. */
                sasfile sashelp.zipcode load;

                %do i = &firstobs. %to &obs.;

                    /* firstobs=&i obs=&i selects exactly the i-th observation.
                       OUTOBS=1 keeps only the smallest-distance pair. */
                    proc sql outobs=1 noprint;
                        create table nearest_point as
                            select geodist(t1.lat, t1.lon, t2.y, t2.x) as Distance
                                 , t2.city as Nearest_City
                            from workdir.bigdata(obs=&i. firstobs=&i.) as t1
                            cross join
                                 sashelp.zipcode as t2
                            where not missing(t2.x)
                            order by Distance
                        ;
                    quit;

                    /* Save to primary session library */
                    proc append base=workdir._all_nearest_points_&t.
                                data=nearest_point
                                force;
                    run;
                %end;

                /* Unload zipcode data from this worker's memory */
                sasfile sashelp.zipcode close;
            %mend thread_loop;

            %thread_loop;

        endrsubmit;
    %end;

    /* Wait for all workers to end; the tasks are listed explicitly */
    waitfor _ALL_
        %do t = 1 %to &threads.;
            worker&t.
        %end;
    ;

    /* End the worker sessions so they do not linger after the macro */
    %do t = 1 %to &threads.;
        signoff worker&t.;
    %end;

    /* Append all data to the master file */
    proc datasets lib=work nolist nowarn;

        /* Delete final appended output data if it already exists */
        delete all_nearest_points;

        %do t = 1 %to &threads.;
            append base = all_nearest_points
                   data = _all_nearest_points_&t.
                   force
            ;
        %end;

        /* Remove tmp files */
        delete _all_nearest_points_:;
    quit;

    /* Restore log */
    proc printto log=log;
    run;

%mend nearest_distance;

%nearest_distance;