Question

我有一个带有位置名称的纬度和经度列表，以及另一个只有纬度和经度的列表。我需要把第二个列表映射到第一个列表中最接近的位置。我尝试在R中使用geosphere，但数据太大，最终收到一条错误消息，提示“无法分配大小为718.5 GB的向量”！有任何想法吗？我们要映射的数据非常庞大（接近100M行，分为48段），需要映射到的经纬度列表大约有80k条记录。

Answer 1

沿着RomanLuštrik的思路，将数据分成尽可能小的块将是最理想的解决方案。我们逐行查找最接近的点，而不是尝试一次将所有数据加载到内存中。此示例是基于SAS的解决方案。

通过遍历哈希表也可以更有效地完成此示例，但是在这里解释起来会更加复杂。这也可以并行化。这种方法具有中等效率，但更易于遵循。为此，我们使用两个示例数据集：

1. Mobile_activity_3months_scrambled.csv - http://js.cit.datalens.api.here.com/datasets/starter_pack/Mobile_activity_3months_scrambled.csv

50万行。我们把它当作您的大型数据集。

2. sashelp.zipcode

41k行。我们把它当作您的小型数据集。

目标：将每个数据点映射到最近的城市。

为使此过程尽可能简单，我们先只读取一行，并将其与最近的城市匹配。首先，读入您的数据：

/* Read the mobile-activity CSV into the BIGDATA data set, overwriting any
   existing copy. Edit the path placeholder before running. */
proc import
    datafile='CHANGE DIRECTORY HERE\Mobile_activity_3months_scrambled.csv'
    dbms=csv
    out=bigdata
    replace;
run;

接下来，我们读入一行，并计算它与所有其他经纬度对之间的地理距离。我们将使用SQL对这些数据做笛卡尔积。

/* Pair the first BIGDATA observation with every SASHELP.ZIPCODE row that has
   a longitude, compute the geodetic distance for each pair, and store the
   pairs in NEAREST_POINT sorted so the closest city is the first row. */
proc sql noprint;
    create table nearest_point as
        select geodist(big.lat, big.lon, zip.y, zip.x) as Distance,
               zip.city                                as Nearest_City
        from bigdata(firstobs=1 obs=1) as big
             cross join
             sashelp.zipcode           as zip
        where zip.x is not missing
        order by Distance
    ;
quit;

输出数据集中的第一个观测值是您最近的距离。

让我们将其推广到多个观测值。先对其中的10个观测值执行此操作，同时稍微提高效率。我们不需要输出全部41k个观测值，只需输出距离最小的那个观测值并将其追加到主表即可。为此，在SQL中添加outobs=1选项。

/*
 * nearest_distance
 *
 * For each of the first &rows. observations of BIGDATA, cross join that one
 * observation with SASHELP.ZIPCODE, keep only the closest city, and append
 * it to ALL_NEAREST_POINTS (one output row per processed observation).
 *
 * Parameters:
 *   rows=  Number of leading BIGDATA observations to process (default 10).
 */
%macro nearest_distance(rows=10);

    /* Keep the loop index out of the caller's macro scope */
    %local i;

    /* Start from an empty result table: PROC APPEND only ever adds rows, so
       without this a second run would stack duplicates onto the first run.
       NOWARN keeps the log quiet when the table does not exist yet. */
    proc datasets nolist nowarn;
        delete all_nearest_points;
    quit;

    %do i = 1 %to &rows.;

        /* OUTOBS=1 keeps only the first row of the distance-ordered result,
           i.e. the nearest city for observation &i. */
        proc sql outobs=1 noprint;
            create table nearest_point as
                select geodist(t1.lat, t1.lon, t2.y, t2.x) as Distance
                     , t2.city as Nearest_City
                from bigdata(obs=&i. firstobs=&i.) as t1
                CROSS JOIN
                     sashelp.zipcode as t2
                where NOT missing(t2.x)
                order by Distance
            ;
        quit;

        /* Accumulate the single best match */
        proc append base=all_nearest_points
                    data=nearest_point
                    force;
        run;

    %end;

%mend;
%nearest_distance;

让我们进一步推广，并取消对日志的写入以加快速度。我们甚至可以将邮政编码数据预加载到内存中，然后处理所有观测值。为了测试该示例，我们先将bigdata限制为最多100个观测值。

/* Keep the example fast: cap BIGDATA at its first 100 observations */
data bigdata;
    set bigdata(obs=100);
run;

/*
 * nearest_distance
 *
 * For every observation of BIGDATA, cross join that one observation with
 * SASHELP.ZIPCODE, keep only the closest city, and append it to
 * ALL_NEAREST_POINTS. The session log is redirected to a scratch file in the
 * WORK directory while the loop runs and restored afterwards.
 */
%macro nearest_distance;

    /* Keep helper variables out of the caller's macro scope */
    %local dsid n rc i;

    /* Count the observations to process; stop early if BIGDATA cannot be opened */
    %let dsid   = %sysfunc(open(bigdata) );
    %if &dsid. = 0 %then %do;
        %put ERROR: nearest_distance could not open BIGDATA. %sysfunc(sysmsg() );
        %return;
    %end;
    %let n      = %sysfunc(attrn(&dsid., nlobs) );
    %let rc     = %sysfunc(close(&dsid.) );

    /* Start from an empty result table: PROC APPEND only ever adds rows, so
       rows left by an earlier run would otherwise be duplicated.
       NOWARN keeps the log quiet when the table does not exist yet. */
    proc datasets nolist nowarn;
        delete all_nearest_points;
    quit;

    /* Hold the lookup table in memory so it is not re-read from disk on
       every iteration */
    sasfile sashelp.zipcode load;

    /* Send the log to a scratch file; a forward slash is used as the path
       separator so the path is valid on both Windows and Unix hosts */
    proc printto log="%sysfunc(getoption(work) )/_tmp_.txt";
    run;

    %do i = 1 %to &n.;

        /* OUTOBS=1 keeps only the first row of the distance-ordered result,
           i.e. the nearest city for observation &i. */
        proc sql outobs=1 noprint;
            create table nearest_point as
                select geodist(t1.lat, t1.lon, t2.y, t2.x) as Distance
                     , t2.city as Nearest_City
                from bigdata(obs=&i. firstobs=&i.) as t1
                CROSS JOIN
                     sashelp.zipcode as t2
                where NOT missing(t2.x)
                order by Distance
            ;
        quit;

        /* Accumulate the single best match */
        proc append base=all_nearest_points
                    data=nearest_point
                    force;
        run;

    %end;

    /* Restore the default log destination */
    proc printto log=log;
    run;

    /* Release the in-memory copy of the lookup table */
    sasfile sashelp.zipcode close;

%mend;
%nearest_distance;

接下来，让我们将其并行化，一次完成所有操作。您可以使用threads选项更改要使用的并行会话数量。

/*
 * nearest_distance (parallel version)
 *
 * Splits the observations of BIGDATA into &threads. contiguous ranges and
 * hands each range to its own SAS/CONNECT session (worker1..worker&threads.).
 * Each worker finds the nearest SASHELP.ZIPCODE city for every observation in
 * its range and appends the matches to _ALL_NEAREST_POINTS_<t> in the primary
 * session's WORK directory. Once all workers finish, the per-worker tables
 * are concatenated into WORK.ALL_NEAREST_POINTS and removed.
 *
 * Parameters:
 *   threads=  Number of parallel worker sessions to start (default 5).
 */
%macro nearest_distance(threads=5);

    /* Keep helper variables out of the caller's macro scope */
    %local workdir dsid n rc t firstobs obs;

    /* Parallel submit options */
    options
        autosignon=yes
        sascmd='!sascmd'
    ;

    /* Current session work directory */
    %let workdir = %sysfunc(getoption(work) );

    /* Total obs in big data; stop early if BIGDATA cannot be opened */
    %let dsid   = %sysfunc(open(bigdata) );
    %if &dsid. = 0 %then %do;
        %put ERROR: nearest_distance could not open BIGDATA. %sysfunc(sysmsg() );
        %return;
    %end;
    %let n      = %sysfunc(attrn(&dsid., nlobs) );
    %let rc     = %sysfunc(close(&dsid.) );

    /* Remove the final table and any per-worker tables left behind by an
       earlier (possibly aborted) run, because the workers only ever append.
       NOWARN keeps the log quiet when nothing matches. */
    proc datasets lib=work nolist nowarn;
        delete all_nearest_points _all_nearest_points_:;
    quit;

    /* Load lookup table to memory */
    /* NOTE(review): this loads the table in the primary session only; the
       workers are separate sessions, so confirm whether they benefit. */
    sasfile sashelp.zipcode load;

    /* Prevent writing to session log; a forward slash is used as the path
       separator so the path is valid on both Windows and Unix hosts */
    proc printto log="&workdir./_tmp_.txt";
    run;

    /* Run in &threads parallel sessions */
    %do t = 1 %to &threads.;

        /* Divide up observations for each thread: worker t gets the range
           floor(n*(t-1)/threads)+1 .. floor(n*t/threads) */
        %let firstobs = %sysevalf(&n-(&n/&threads.)*(&threads.-&t+1)+1, floor);
        %let obs      = %sysevalf(&n-(&n/&threads.)*(&threads.-&t.),    floor);

        /* Transfer primary session macro variables to each worker session */
        %syslput _USER_ / remote=worker&t.;

        /* Parallel calculations for data in memory */
        rsubmit wait=no remote=worker&t.;

            /* We are in a specific session, and must define this as a macro within the session */
            %macro thread_loop;

                /* Primary session library (assigned once, not per iteration) */
                libname workdir "&workdir.";

                %do i = &firstobs. %to &obs.;

                    /* OUTOBS=1 keeps only the nearest city for observation &i. */
                    proc sql outobs=1 noprint;
                        create table nearest_point as
                            select geodist(t1.lat, t1.lon, t2.y, t2.x) as Distance
                                 , t2.city as Nearest_City
                            from workdir.bigdata(obs=&i. firstobs=&i.) as t1
                            CROSS JOIN
                                 sashelp.zipcode as t2
                            where NOT missing(t2.x)
                            order by Distance
                        ;
                    quit;

                    /* Save to primary session library */
                    proc append base=workdir._all_nearest_points_&t.
                                data=nearest_point
                                force;
                    run;
                %end;
            %mend;
            %thread_loop;

        endrsubmit;
    %end;

    /* Wait for all workers to end, naming each task explicitly */
    waitfor _ALL_
        %do t = 1 %to &threads.;
            worker&t.
        %end;
    ;

    /* Sign the workers off so the spawned sessions do not linger after the
       macro finishes */
    %do t = 1 %to &threads.;
        signoff worker&t.;
    %end;

    /* Unload zipcode data from memory */
    sasfile sashelp.zipcode close;

    /* Append all data to the master file */
    proc datasets lib=work nolist nowarn;

        %do t = 1 %to &threads.;
            /* A worker whose range was empty (more threads than rows)
               never created its table; skip it instead of failing */
            %if %sysfunc(exist(work._all_nearest_points_&t.) ) %then %do;
                append base =  all_nearest_points
                       data = _all_nearest_points_&t.
                       force
                ;
            %end;
        %end;

        /* Remove tmp files */
        delete _all_nearest_points_:;
    quit;

    /* Restore log */
    proc printto log=log;
    run;

%mend;
%nearest_distance;

映射地理坐标的方法，以便在经纬度的另一个配置文件中标记数据？

1 个答案: