在我的 OpenCL 内核中,我想把访问频率高于其他部分的那部分数据复制到本地内存(local memory)中。为此,一开始我只是手动逐个字段赋值,复制代码如下:
__global LinearBVHInteriorNode* subtree_globalptr = &(((__global BVHTree*)accelstruct)->interior_nodes[subtree_interior_idx]);
__global LinearBVHInteriorNodeInfo* subtree_info_globalptr = &(((__global BVHTree*)accelstruct)->interior_nodes_info[subtree_interior_idx]);
int last_subtree_interior_idx = subtree_info_globalptr->last_interior;
int n_subtree_interior_nodes = last_subtree_interior_idx - subtree_interior_idx + 1;
int interior_nodes_beg_idx = thread_local_idx * ((n_subtree_interior_nodes / local_dim) + 1);
int interior_nodes_end_idx = min((thread_local_idx + 1) * ((n_subtree_interior_nodes / local_dim) + 1), last_subtree_interior_idx);
if(thread_local_idx == 0){
subtree.interior_base_idx = subtree_info_globalptr[1].parent;
subtree.leaf_base_idx = subtree_info_globalptr[0].leaf_lowidx;
subtree.tree = accelstruct;
}
// Copy the data for interior nodes from the global memory to local memory
for(int i = interior_nodes_beg_idx; i <= interior_nodes_end_idx; i++){
subtree.interior_nodes[ i ].left_bound.min = subtree_globalptr[ i ].left_bound.min;
subtree.interior_nodes[ i ].left_bound.max = subtree_globalptr[ i ].left_bound.max;
subtree.interior_nodes[ i ].right_bound.min = subtree_globalptr[ i ].right_bound.min;
subtree.interior_nodes[ i ].right_bound.max = subtree_globalptr[ i ].right_bound.max;
subtree.interior_nodes[ i ].children[0] = subtree_globalptr[ i ].children[0];
subtree.interior_nodes[ i ].children[1] = subtree_globalptr[ i ].children[1];
subtree.interior_nodes[ i ].splitAxis = subtree_globalptr[ i ].splitAxis;
}
int leafnodes_lowidx = subtree_info_globalptr->leaf_lowidx;
int leafnodes_highidx = subtree_info_globalptr->leaf_highidx;
int n_subtree_leaf_nodes = leafnodes_highidx - leafnodes_lowidx;
__global LinearBVHLeafNode* subtree_leaf_globalptr = &(((__global BVHTree*)accelstruct)->leaf_nodes[leafnodes_lowidx]);
int leaf_nodes_beg_idx = thread_local_idx * ((n_subtree_leaf_nodes / local_dim) + 1);
int leaf_nodes_end_idx = min((thread_local_idx + 1) * ((n_subtree_leaf_nodes / local_dim) + 1), leafnodes_highidx);
// Copy the data for leaf nodes from the global memory to local memory
for(int i = leaf_nodes_beg_idx; i < leaf_nodes_end_idx; i++){
subtree.leaf_nodes[ i ].lowIdx = subtree_leaf_globalptr[ i ].lowIdx;
subtree.leaf_nodes[ i ].highIdx = subtree_leaf_globalptr[ i ].highIdx;
}
// Wait all the threads to finish the copying task.
barrier(CLK_LOCAL_MEM_FENCE);
对于我的数据集,把数据从全局内存复制到本地内存花了 0.1 秒,这太慢了。据我所知,async_work_group_copy 应该比手动复制执行得更快,因此我改用了 async_work_group_copy:
__global LinearBVHInteriorNode* subtree_globalptr = &(((__global BVHTree*)accelstruct)->interior_nodes[subtree_interior_idx]);
__global LinearBVHInteriorNodeInfo* subtree_info_globalptr = &(((__global BVHTree*)accelstruct)->interior_nodes_info[subtree_interior_idx]);
int last_subtree_interior_idx = subtree_info_globalptr->last_interior;
int n_subtree_interior_nodes = last_subtree_interior_idx - subtree_interior_idx + 1;
int interior_nodes_beg_idx = thread_local_idx * ((n_subtree_interior_nodes / local_dim) + 1);
int interior_nodes_end_idx = min((thread_local_idx + 1) * ((n_subtree_interior_nodes / local_dim) + 1), last_subtree_interior_idx);
if(thread_local_idx == 0){
subtree.interior_base_idx = subtree_info_globalptr[1].parent;
subtree.leaf_base_idx = subtree_info_globalptr[0].leaf_lowidx;
subtree.tree = accelstruct;
}
event_t copy_events[2];
// Copy the data for interior nodes from the global memory to local memory
copy_events[0] = async_work_group_copy((__local long2*)(&subtree.interior_nodes[ interior_nodes_beg_idx ]), (__global long2 *)(&subtree_globalptr[ interior_nodes_beg_idx ]), (interior_nodes_end_idx - interior_nodes_beg_idx + 1) * 5, 0);
int leafnodes_lowidx = subtree_info_globalptr->leaf_lowidx;
int leafnodes_highidx = subtree_info_globalptr->leaf_highidx;
int n_subtree_leaf_nodes = leafnodes_highidx - leafnodes_lowidx;
__global LinearBVHLeafNode* subtree_leaf_globalptr = &(((__global BVHTree*)accelstruct)->leaf_nodes[leafnodes_lowidx]);
int leaf_nodes_beg_idx = thread_local_idx * ((n_subtree_leaf_nodes / local_dim) + 1);
int leaf_nodes_end_idx = min((thread_local_idx + 1) * ((n_subtree_leaf_nodes / local_dim) + 1), leafnodes_highidx);
// Copy the data for leaf nodes from the global memory to local memory
copy_events[1] = async_work_group_copy((__local long*)&subtree.interior_nodes[ leaf_nodes_beg_idx ], (__global long*)&subtree_globalptr[ leaf_nodes_beg_idx ], (leaf_nodes_end_idx - leaf_nodes_beg_idx), 0);
// Wait all the threads to finish the copying task.
barrier(CLK_LOCAL_MEM_FENCE);
但这样一来,复制反而花了 1.0 秒,这很奇怪。我使用的结构体定义如下:
/* One interior node of the linearized BVH. */
typedef struct {
    AABB   left_bound;   /* bound associated with the left side (presumably the left child's box) */
    AABB   right_bound;  /* bound associated with the right side (presumably the right child's box) */
    int    children[2];  /* two child references; presumably node indices - confirm */
    ushort splitAxis;    /* split axis of this node */
    int    pad;          /* padding field; carries no node data in the visible code */
} LinearBVHInteriorNode;
/* Bookkeeping stored alongside each interior node (same index as the node). */
typedef struct {
    int parent;         /* parent reference of this interior node */
    int leaf_lowidx;    /* first leaf index covered by this node's subtree */
    int leaf_highidx;   /* upper leaf index of this node's subtree */
    int last_interior;  /* index of the last interior node in this node's subtree */
} LinearBVHInteriorNodeInfo;
/* One leaf node of the linearized BVH: a low/high index pair. */
typedef struct {
    int lowIdx;   /* lower index of the range held by this leaf */
    int highIdx;  /* upper index of the range held by this leaf */
} LinearBVHLeafNode;
/* Bookkeeping stored alongside each leaf node. */
typedef struct {
int parent; /* parent reference of this leaf; presumably an interior node index - confirm */
} LinearBVHLeafNodeInfo;
/*
 * The whole linearized BVH as it lives in global memory: node arrays, their
 * per-node info arrays, and the element counts.
 */
typedef struct {
__global LinearBVHInteriorNode* interior_nodes; //!< a pointer to interior nodes
__global LinearBVHInteriorNodeInfo* interior_nodes_info; //!< a pointer to information of interior nodes (indexed like interior_nodes)
__global LinearBVHLeafNode* leaf_nodes; //!< a pointer to leaf nodes
__global LinearBVHLeafNodeInfo* leaf_nodes_info; //!< a pointer to information of leaf nodes
uint n_interior_nodes; //!< Number of interior nodes
uint n_leaf_nodes; //!< Number of leaf nodes
} BVHTree;
/*
 * A subtree of the BVH staged in local memory, plus what is needed to relate
 * it back to the full tree in global memory.
 */
typedef struct {
    __local LinearBVHInteriorNode* interior_nodes;  /* local copy of the subtree's interior nodes */
    __local LinearBVHLeafNode* leaf_nodes;          /* local copy of the subtree's leaf nodes */
    int interior_base_idx;                          /* base index for the staged interior nodes */
    int leaf_base_idx;                              /* base index for the staged leaf nodes */
    __global BVHTree* tree;                         /* the full tree the subtree was taken from */
} local_BVHTree;
我想在 OpenCL 内核中找一个类似 memcpy 的函数,因为两个数组的结构完全相同,应当可以用这样的指令直接复制。我原以为 async_work_group_copy 做的就是类似的事情。有人遇到过同样的问题吗?