嵌套的FOR循环需要很长的执行时间

时间:2017-02-07 02:28:18

标签: c++ performance for-loop optimization point-clouds

我有运行ubuntu的intel i5(NUC)系统,我正在尝试使用点云开发实时应用程序。

我有一个307200(640x480)迭代的for循环,但有一些条件。代码段是:

void pc_segmentation(pcl::PointCloud<pcl::PointXYZRGB>::Ptr input_cloud,Eigen::Vector4f pt_min,Eigen::Vector4f pt_max)
{
    pcl::PointCloud<pcl::PointXYZRGB> slice_0;
    pcl::PointCloud<pcl::PointXYZRGB> slice_1;
    pcl::PointCloud<pcl::PointXYZRGB> slice_2;
    pcl::PointCloud<pcl::PointXYZRGB> slice_3;
    pcl::PointCloud<pcl::PointXYZRGB> slice_4;
    pcl::PointCloud<pcl::PointXYZRGB> slice_5;
    pcl::PointCloud<pcl::PointXYZRGB> slice_z;
    pcl::PointCloud<pcl::PointXYZRGB> slice_y(input_cloud->width,input_cloud->height);
    pcl::PointCloud<pcl::PointXYZRGB> slice_x(input_cloud->width,input_cloud->height);

    //clearing pointclouds
    slice_0.clear();
    slice_1.clear();
    slice_2.clear();
    slice_3.clear();
    slice_4.clear();
    slice_5.clear();
    slice_z.clear();

    float xmin0=100, xmin1=100, xmin2=100, xmin3=100, xmin4=100, xmin5=100;
    float ymin0=100, ymin1=100, ymin2=100, ymin3=100, ymin4=100, ymin5=100;
    float xmax0=-100, xmax1=-100, xmax2=-100, xmax3=-100, xmax4=-100, xmax5=-100;
    float ymax0=-100, ymax1=-100, ymax2=-100, ymax3=-100, ymax4=-100, ymax5=-100;


    //Slices for Y direction
    Eigen::VectorXf slice_pts_y(NO_OF_SLICES);
    Eigen::VectorXf diff_y(NO_OF_SLICES);
    Eigen::VectorXf slice_pts_x(NO_OF_SLICES_X);
    Eigen::VectorXf diff_x(NO_OF_SLICES_X);

    //Slice Vector Y
    double dist_btw_slice = (pt_max[1] - pt_min[1])/NO_OF_SLICES;
    for(int i=1; i<=NO_OF_SLICES;i++)
        slice_pts_y[i-1] = pt_min[1] + (i*dist_btw_slice);

    //Slice Vector X
    dist_btw_slice = (pt_max[0] - pt_min[0])/NO_OF_SLICES_X;
    for(int i=1; i<=NO_OF_SLICES_X;i++)
        slice_pts_x[i-1] = pt_min[1] + (i*dist_btw_slice);

    ros::Time start_for_loop_time = ros::Time::now();
    for(int ii=0;ii<input_cloud->width;ii++)
        for(int jj=0;jj<input_cloud->height;jj++)
        {
            pcl::PointXYZRGB point = input_cloud->at(ii,jj);
            if((pcl_isfinite(point.z)))
            {
                if((point.z > 0.9)&&(point.z < 1))
                {
                    slice_0.push_back(point);
                }
                else if ((point.z > 1.4)&&(point.z < 1.5))
                {
                    slice_1.push_back(point);
                }
                else if((point.z > 1.9)&&(point.z < 2))
                {
                    slice_2.push_back(point);
                }
                else if ((point.z > 2.4)&&(point.z < 2.5))
                {
                    slice_3.push_back(point);
                }
                else if((point.z > 2.9)&&(point.z < 3))
                {
                    slice_4.push_back(point);
                }
                else if ((point.z > 3.4)&&(point.z < 3.5))
                {
                    slice_5.push_back(point);
                }
            }
            if((pcl_isfinite(point.y)))
            {
                diff_y = slice_pts_y - (point.y * Eigen::VectorXf::Ones(NO_OF_SLICES));
                diff_y=diff_y.array().abs();
                std::sort(diff_y.data(),diff_y.data()+NO_OF_SLICES,std::less<float>());
                if(diff_y[0]<0.005)
                {
                    slice_y.at(ii,jj)=point;
                }
            }
            if((pcl_isfinite(point.x)))
            {
                diff_x = slice_pts_x - (point.x * Eigen::VectorXf::Ones(NO_OF_SLICES_X));
                diff_x=diff_x.array().abs();
                std::sort(diff_x.data(),diff_x.data()+NO_OF_SLICES_X,std::less<float>());
                if(diff_x[0]<0.005)
                {
                    slice_x.at(ii,jj)=point;
                }
            }
        }
    ros::Time end_for_loop_time = ros::Time::now();
    if(seg_pcl_time.is_open())
        seg_pcl_time<<end_for_loop_time.toSec() - start_for_loop_time.toSec()<<std::endl;
    else
        std::cout<<"Segmented Point Cloud for Z slices "<<end_for_loop_time.toSec() - start_for_loop_time.toSec()<<" seconds"<<std::endl;
    ros::Time start_pc_add_time = ros::Time::now();
    slice_z = slice_0 + slice_1;
    slice_z += slice_2;
    slice_z += slice_3;
    slice_z += slice_4;
    slice_z += slice_5;
    pcl::copyPointCloud(slice_z,slices_z_pc); //slices_z_pc is defined globally
    ros::Time end_pc_add_time = ros::Time::now();
    std::cout<<"Adding and Publishing Point Cloud for Z slices "<<end_pc_add_time.toSec() - start_pc_add_time.toSec()<<" seconds"<<std::endl;
    // Publish point clouds
    slices_z_pc.header = input_cloud->header;
    slice_x.header = input_cloud->header;
    slice_y.header = input_cloud->header;
    pub_x_slices.publish(slice_x);
    pub_y_slices.publish(slice_y);
    pub_z_slices.publish(slices_z_pc);
}

NO_OF_SLICES = 20 and NO_OF_SLICES_X = 20

在i5系统上大约需要100ms - 110ms,在运行ubuntu的Odroid(octacore ARM处理器)上需要340ms。我觉得这个处理时间非常长。

我不确定如何优化它以加快运行速度。我还在编译时启用"O3"优化。任何帮助是极大的赞赏。

PS:我也尝试通过3D数组示例point_cloud[640][480][3]循环它,但处理时间略短(约20ms)。

编辑:根据@Toby Speight的建议将代码从for循环的片段更改为整个函数

0 个答案:

没有答案