Question

现在，我正在尝试为某些任务的循环运行非常大的代码，大约进行了8e + 12次迭代。我尝试使用c ++ 11线程，但是它似乎没有按照要求的那样快地工作。我正在使用带有8 GB内存，i5 cpu和Intel图形4000卡的系统。如果我使用openmp会更好，还是必须使用nvidia gpu并使用cuda来完成此任务？我的代码如下：

#include <ros/ros.h>
// PCL specific includes
#include <sensor_msgs/PointCloud2.h>
#include <pcl_conversions/pcl_conversions.h>
#include <pcl/point_types.h>
#include <pcl/filters/voxel_grid.h>
#include <visualization_msgs/Marker.h>
#include <rosbag/bag.h>
#include <std_msgs/Int32.h>
#include <rosbag/view.h>
#include <boost/foreach.hpp>
#define foreach BOOST_FOREACH
#include <fstream>

#include <pcl/point_cloud.h>
#include <pcl/octree/octree_pointcloud_changedetector.h>
#include <pcl/io/pcd_io.h>
#include <iostream>
#include <vector>
#include <ctime>
#include <thread>
ros::Publisher marker_publisher;
int frame_index = 0;
using namespace std;
int x[200000];
void thread_function(pcl::PointCloud<pcl::PointXYZRGB>::ConstPtr cloudB,vector<int> v,int p0) {
    for(size_t p1=0;p1<v.size() && ros::ok();++p1) {
        int p0p1 = sqrt( pow(cloudB->points[v[p1]].x-cloudB->points[v[p0]].x,2)                
        +pow(cloudB->points[v[p1]].y-cloudB->points[v[p0]].y,2)
        +pow(cloudB->points[v[p1]].z-cloudB->points[v[p0]].z,2) ) * 1000;
        if(p0p1>10) {
            for(size_t p2=0;p2<v.size() && ros::ok();++p2) {
                int p0p2 = sqrt( pow(cloudB->points[v[p2]].x-cloudB->points[v[p0]].x,2)                
                +pow(cloudB->points[v[p2]].y-cloudB->points[v[p0]].y,2)
                +pow(cloudB->points[v[p2]].z-cloudB->points[v[p0]].z,2) ) * 1000;
                int p1p2 = sqrt( pow(cloudB->points[v[p2]].x-cloudB->points[v[p1]].x,2)                
                +pow(cloudB->points[v[p2]].y-cloudB->points[v[p1]].y,2)
                +pow(cloudB->points[v[p2]].z-cloudB->points[v[p1]].z,2) ) * 1000;
                if(p0p2>10 && p1p2>10) {
                }    
            }    
        }       
    }    
    x[p0] = 3;
    cout<<"ended thread="<<p0<<endl;
}

void cloud_cb (const sensor_msgs::PointCloud2ConstPtr& input)
{
    frame_index++;
    pcl::PointCloud<pcl::PointXYZRGB>::Ptr cloudB (new pcl::PointCloud<pcl::PointXYZRGB> );
    pcl::fromROSMsg(*input,*cloudB);

    // Initializing Marker parameters which will be used in rviz
    vector<visualization_msgs::Marker> line_list, marker, text_view_facing;
    line_list.resize(4); marker.resize(4); text_view_facing.resize(4);
    for(int i=0;i<line_list.size();i++) {
        marker[i].header.frame_id = line_list[i].header.frame_id = text_view_facing[i].header.frame_id = "/X3/base_link";
        marker[i].header.stamp = line_list[i].header.stamp = text_view_facing[i].header.stamp =ros::Time();
        marker[i].ns = line_list[i].ns = text_view_facing[i].ns ="lines";
        marker[i].action = line_list[i].action = text_view_facing[i].action = visualization_msgs::Marker::ADD;
        marker[i].pose.orientation.w = line_list[i].pose.orientation.w = text_view_facing[i].pose.orientation.w = 1;
        marker[i].id = i+4;
        line_list[i].id = i;
        marker[i].type = visualization_msgs::Marker::POINTS;
        line_list[i].type = visualization_msgs::Marker::LINE_LIST;
        line_list[i].color.r = 1; line_list[i].color.g = 1; line_list[i].color. b = 1; line_list[i].color.a = 1;
        marker[i].scale.x = 0.003;
        marker[i].scale.y = 0.003;
        marker[i].scale.z = 0.003;
        text_view_facing[i].id = i+8;
        text_view_facing[i].type = visualization_msgs::Marker::TEXT_VIEW_FACING;
        text_view_facing[i].color.b = 1; text_view_facing[i].color.a = 1.0; text_view_facing[i].color.g = 1.0; text_view_facing[i].color.r = 1.0;
        text_view_facing[i].scale.z = 0.015;
    }
    marker[3].scale.x = 0.05;
    marker[3].scale.y = 0.05;
    marker[3].scale.z = 0.05;

    if(frame_index==10) // Saving the point cloud for only one time to find moved object in it
    {
      pcl::io::savePCDFileASCII ("test_pcd.pcd", *cloudB);
    }
    if(frame_index>10) // Reading above point cloud file after saving for once to compare it with newly arriving point clouds
    {
        pcl::PointCloud<pcl::PointXYZRGB>::Ptr cloud (new pcl::PointCloud<pcl::PointXYZRGB>);

        if (pcl::io::loadPCDFile<pcl::PointXYZRGB> ("test_pcd.pcd", *cloud) == -1) //* load the file
        {
            PCL_ERROR ("Couldn't read file test_pcd.pcd \n");
        }
        else {

            srand ((unsigned int) time (NULL));

            // Octree resolution - side length of octree voxels
            double resolution = 0.1;

            // Instantiate octree-based point cloud change detection class
            pcl::octree::OctreePointCloudChangeDetector<pcl::PointXYZRGB> octree (resolution);

            // Add points from cloudA to octree
            octree.setInputCloud (cloud);
            octree.addPointsFromInputCloud ();

            // Switch octree buffers: This resets octree but keeps previous tree structure in memory.
            octree.switchBuffers ();

            // Add points from cloudB to octree
            octree.setInputCloud (cloudB);
            octree.addPointsFromInputCloud ();

            std::vector<int> newPointIdxVector;

            // Get vector of point indices from octree voxels which did not exist in previous buffer
            octree.getPointIndicesFromNewVoxels (newPointIdxVector);

            geometry_msgs::Point p; std_msgs::ColorRGBA c;
            for (size_t i = 0; i < newPointIdxVector.size (); ++i)
            {                        
                p.x = cloudB->points[newPointIdxVector[i]].x;
                p.y = cloudB->points[newPointIdxVector[i]].y;
                p.z = cloudB->points[newPointIdxVector[i]].z;
                c.r = cloudB->points[newPointIdxVector[i]].r/255.0; 
                c.g = cloudB->points[newPointIdxVector[i]].g/255.0; 
                c.b = cloudB->points[newPointIdxVector[i]].b/255.0; 
                c.a = 1;

                //cout<<newPointIdxVector.size()<<"\t"<<p.x<<"\t"<<p.y<<"\t"<<p.z<<endl;   

                if(!isnan(p.x) && !isnan(p.y) && !isnan(p.z)) {

                    marker[3].points.push_back(p);
                    marker[3].colors.push_back(c);
                }
            }
            marker_publisher.publish(marker[3]);

            pcl::PointCloud<pcl::PointXYZRGB> P;
            thread t[newPointIdxVector.size()];

            for(int p0=0;p0<newPointIdxVector.size();++p0) { // For each voxel in moved object 
                t[p0] = thread(thread_function,cloudB,newPointIdxVector,p0);  
            }   
            for(int p0=0;p0<newPointIdxVector.size();++p0) { // For each voxel in moved object 
                t[p0].join();
                cout<<"joined"<<"\t"<<p0<<"\t"<<x[p0]<<endl;
            }
        } 

    }    


}

int main (int argc, char** argv)
{
  ros::init (argc, argv, "training");
  ros::NodeHandle nh;

  ros::Subscriber sub = nh.subscribe<sensor_msgs::PointCloud2> ("input", 1, cloud_cb);
  marker_publisher = nh.advertise<visualization_msgs::Marker> ("visualization_marker",1);

  // Spin
  ros::spin ();
}

这个任务对我的算法完成非常重要。我需要一个建议，以使此循环运行得非常快。在上面的代码中，thread_function是主要功能，其中我当前正在放置for循环。他们有什么办法提高上述代码的性能吗？

Answer 1

OpenMP是最容易实现和尝试的。只需在您的CMakeLists.txt文件中添加几行，一个包含和就在for循环之前的著名的#pragma omp parallel for行。

Answer 2

线程本身不一定是速度的保证。如果您的过程主要是线性的，则无需并行执行任何操作。在您的情况下，看起来您有一个循环，并且每个迭代都可以并行地独立完成，但是由于每个循环是如此之小，并且大多数情况下是简单的数学运算，因此使每个项目成为自己的线程的开销可能不会为您节省很多（如果有）的时间。算法本身可能需要检修（即以完全不同的方式执行），但是如果您的循环很大，线程可以潜在地解决您的问题，您可以将其分成4个块并并行处理这4个块（即一个线程）执行0-100，另一个101-200等）。请注意，一个进程可能先于另一个进程完成，并且如果某个其他进程依赖于整个数据集的完成，那么您需要确保在完成所有4个线程之前都已完成。而且，如果您在并行进程中对数据进行了任何形式的操作（即，移位元素，添加，删除），那么最终可能会搞砸并行线程。希望有帮助！

如何提高大型for循环的速度

2 个答案: