现在,我正在尝试为某些任务的循环运行非常大的代码,大约进行了8e + 12次迭代。我尝试使用c ++ 11线程,但是它似乎没有按照要求的那样快地工作。我正在使用带有8 GB内存,i5 cpu和Intel图形4000卡的系统。如果我使用openmp会更好,还是必须使用nvidia gpu并使用cuda来完成此任务?我的代码如下:
#include <ros/ros.h>
// PCL specific includes
#include <sensor_msgs/PointCloud2.h>
#include <pcl_conversions/pcl_conversions.h>
#include <pcl/point_types.h>
#include <pcl/filters/voxel_grid.h>
#include <visualization_msgs/Marker.h>
#include <rosbag/bag.h>
#include <std_msgs/Int32.h>
#include <rosbag/view.h>
#include <boost/foreach.hpp>
#define foreach BOOST_FOREACH
#include <fstream>
#include <pcl/point_cloud.h>
#include <pcl/octree/octree_pointcloud_changedetector.h>
#include <pcl/io/pcd_io.h>
#include <iostream>
#include <vector>
#include <ctime>
#include <thread>
ros::Publisher marker_publisher;
int frame_index = 0;
using namespace std;
int x[200000];
void thread_function(pcl::PointCloud<pcl::PointXYZRGB>::ConstPtr cloudB,vector<int> v,int p0) {
for(size_t p1=0;p1<v.size() && ros::ok();++p1) {
int p0p1 = sqrt( pow(cloudB->points[v[p1]].x-cloudB->points[v[p0]].x,2)
+pow(cloudB->points[v[p1]].y-cloudB->points[v[p0]].y,2)
+pow(cloudB->points[v[p1]].z-cloudB->points[v[p0]].z,2) ) * 1000;
if(p0p1>10) {
for(size_t p2=0;p2<v.size() && ros::ok();++p2) {
int p0p2 = sqrt( pow(cloudB->points[v[p2]].x-cloudB->points[v[p0]].x,2)
+pow(cloudB->points[v[p2]].y-cloudB->points[v[p0]].y,2)
+pow(cloudB->points[v[p2]].z-cloudB->points[v[p0]].z,2) ) * 1000;
int p1p2 = sqrt( pow(cloudB->points[v[p2]].x-cloudB->points[v[p1]].x,2)
+pow(cloudB->points[v[p2]].y-cloudB->points[v[p1]].y,2)
+pow(cloudB->points[v[p2]].z-cloudB->points[v[p1]].z,2) ) * 1000;
if(p0p2>10 && p1p2>10) {
}
}
}
}
x[p0] = 3;
cout<<"ended thread="<<p0<<endl;
}
void cloud_cb (const sensor_msgs::PointCloud2ConstPtr& input)
{
frame_index++;
pcl::PointCloud<pcl::PointXYZRGB>::Ptr cloudB (new pcl::PointCloud<pcl::PointXYZRGB> );
pcl::fromROSMsg(*input,*cloudB);
// Initializing Marker parameters which will be used in rviz
vector<visualization_msgs::Marker> line_list, marker, text_view_facing;
line_list.resize(4); marker.resize(4); text_view_facing.resize(4);
for(int i=0;i<line_list.size();i++) {
marker[i].header.frame_id = line_list[i].header.frame_id = text_view_facing[i].header.frame_id = "/X3/base_link";
marker[i].header.stamp = line_list[i].header.stamp = text_view_facing[i].header.stamp =ros::Time();
marker[i].ns = line_list[i].ns = text_view_facing[i].ns ="lines";
marker[i].action = line_list[i].action = text_view_facing[i].action = visualization_msgs::Marker::ADD;
marker[i].pose.orientation.w = line_list[i].pose.orientation.w = text_view_facing[i].pose.orientation.w = 1;
marker[i].id = i+4;
line_list[i].id = i;
marker[i].type = visualization_msgs::Marker::POINTS;
line_list[i].type = visualization_msgs::Marker::LINE_LIST;
line_list[i].color.r = 1; line_list[i].color.g = 1; line_list[i].color. b = 1; line_list[i].color.a = 1;
marker[i].scale.x = 0.003;
marker[i].scale.y = 0.003;
marker[i].scale.z = 0.003;
text_view_facing[i].id = i+8;
text_view_facing[i].type = visualization_msgs::Marker::TEXT_VIEW_FACING;
text_view_facing[i].color.b = 1; text_view_facing[i].color.a = 1.0; text_view_facing[i].color.g = 1.0; text_view_facing[i].color.r = 1.0;
text_view_facing[i].scale.z = 0.015;
}
marker[3].scale.x = 0.05;
marker[3].scale.y = 0.05;
marker[3].scale.z = 0.05;
if(frame_index==10) // Saving the point cloud for only one time to find moved object in it
{
pcl::io::savePCDFileASCII ("test_pcd.pcd", *cloudB);
}
if(frame_index>10) // Reading above point cloud file after saving for once to compare it with newly arriving point clouds
{
pcl::PointCloud<pcl::PointXYZRGB>::Ptr cloud (new pcl::PointCloud<pcl::PointXYZRGB>);
if (pcl::io::loadPCDFile<pcl::PointXYZRGB> ("test_pcd.pcd", *cloud) == -1) //* load the file
{
PCL_ERROR ("Couldn't read file test_pcd.pcd \n");
}
else {
srand ((unsigned int) time (NULL));
// Octree resolution - side length of octree voxels
double resolution = 0.1;
// Instantiate octree-based point cloud change detection class
pcl::octree::OctreePointCloudChangeDetector<pcl::PointXYZRGB> octree (resolution);
// Add points from cloudA to octree
octree.setInputCloud (cloud);
octree.addPointsFromInputCloud ();
// Switch octree buffers: This resets octree but keeps previous tree structure in memory.
octree.switchBuffers ();
// Add points from cloudB to octree
octree.setInputCloud (cloudB);
octree.addPointsFromInputCloud ();
std::vector<int> newPointIdxVector;
// Get vector of point indices from octree voxels which did not exist in previous buffer
octree.getPointIndicesFromNewVoxels (newPointIdxVector);
geometry_msgs::Point p; std_msgs::ColorRGBA c;
for (size_t i = 0; i < newPointIdxVector.size (); ++i)
{
p.x = cloudB->points[newPointIdxVector[i]].x;
p.y = cloudB->points[newPointIdxVector[i]].y;
p.z = cloudB->points[newPointIdxVector[i]].z;
c.r = cloudB->points[newPointIdxVector[i]].r/255.0;
c.g = cloudB->points[newPointIdxVector[i]].g/255.0;
c.b = cloudB->points[newPointIdxVector[i]].b/255.0;
c.a = 1;
//cout<<newPointIdxVector.size()<<"\t"<<p.x<<"\t"<<p.y<<"\t"<<p.z<<endl;
if(!isnan(p.x) && !isnan(p.y) && !isnan(p.z)) {
marker[3].points.push_back(p);
marker[3].colors.push_back(c);
}
}
marker_publisher.publish(marker[3]);
pcl::PointCloud<pcl::PointXYZRGB> P;
thread t[newPointIdxVector.size()];
for(int p0=0;p0<newPointIdxVector.size();++p0) { // For each voxel in moved object
t[p0] = thread(thread_function,cloudB,newPointIdxVector,p0);
}
for(int p0=0;p0<newPointIdxVector.size();++p0) { // For each voxel in moved object
t[p0].join();
cout<<"joined"<<"\t"<<p0<<"\t"<<x[p0]<<endl;
}
}
}
}
int main (int argc, char** argv)
{
ros::init (argc, argv, "training");
ros::NodeHandle nh;
ros::Subscriber sub = nh.subscribe<sensor_msgs::PointCloud2> ("input", 1, cloud_cb);
marker_publisher = nh.advertise<visualization_msgs::Marker> ("visualization_marker",1);
// Spin
ros::spin ();
}
这个任务对我的算法完成非常重要。我需要一个建议,以使此循环运行得非常快。 在上面的代码中,thread_function是主要功能,其中我当前正在放置for循环。他们有什么办法提高上述代码的性能吗?
答案 0 :(得分:0)
OpenMP是最容易实现和尝试的。只需在您的CMakeLists.txt文件中添加几行,一个包含和就在for循环之前的著名的#pragma omp parallel for
行。
答案 1 :(得分:0)
线程本身不一定是速度的保证。如果您的过程主要是线性的,则无需并行执行任何操作。在您的情况下,看起来您有一个循环,并且每个迭代都可以并行地独立完成,但是由于每个循环是如此之小,并且大多数情况下是简单的数学运算,因此使每个项目成为自己的线程的开销可能不会为您节省很多(如果有)的时间。算法本身可能需要检修(即以完全不同的方式执行),但是如果您的循环很大,线程可以潜在地解决您的问题,您可以将其分成4个块并并行处理这4个块(即一个线程)执行0-100,另一个101-200等)。请注意,一个进程可能先于另一个进程完成,并且如果某个其他进程依赖于整个数据集的完成,那么您需要确保在完成所有4个线程之前都已完成。而且,如果您在并行进程中对数据进行了任何形式的操作(即,移位元素,添加,删除),那么最终可能会搞砸并行线程。希望有帮助!