我编写了一个代码,它获得了一个可能性向量的第一个_var位置(即矩阵_size * _var,_var = 3和_size = 27)并在我的内核中调用此函数(32个线程,即每个都有一个对象) )但我没有得到函数的任何返回值既没有NULL指针。
程序退出时没有错误但是内核中的printf行没有被执行或显示(甚至用sm_20或更高版本编译),好像程序之前停止一样。
的 dataIntern.h:
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#define _MIN -1
#define _MAX 1
#ifdef __CUDACC__
#define CUDA_CALLABLE_MEMBER __host__ __device__
#else
#define CUDA_CALLABLE_MEMBER
#endif
template <class a_type>
class dataIntern{
private:
a_type *possibilities;
int _assign;
int _size;
int _var;
int _maxsize;
public:
CUDA_CALLABLE_MEMBER dataIntern(){
}
CUDA_CALLABLE_MEMBER dataIntern(int var){
_var = var;
_size = (int)pow(3.0, (double)_var);
_maxsize = _size * _var;
_assign = 1;
possibilities = (a_type*)malloc(_maxsize*sizeof(a_type));
if(!possibilities){
exit(1);
}
createTable();
}
CUDA_CALLABLE_MEMBER void createTable(){
int i, j, k, limit, pos;
a_type value;
if(_assign == 1){
for(i=0; i<_var; i++){
#ifdef __CUDA_ARCH__
limit = (int)pow(3.0, _var-i-1);
#else
limit = (int)pow(3.0, (double)_var-i-1);
#endif
value = (a_type)_MIN;
k = 0;
for(j=0; j<_size; j++){
pos = _var*j+i;
if(k >= limit){
value++;
if(value > _MAX){
value = (a_type)_MIN;
}
k = 0;
}
possibilities[pos] = value;
k++;
}
}
}
}
CUDA_CALLABLE_MEMBER void print(){
int i;
printf("Printing.\n");
if(_assign == 1){
for(i=0; i<_size*_var; i++){
printf("%d ", possibilities[i]);
if(i%_var == _var-1){
printf("\n");
}
}
}
else{
printf("Not assigned.\n");
}
}
CUDA_CALLABLE_MEMBER void retify(int posChanged, a_type valueRetified){
int i, pos, count, initpos, attrib;
a_type *newnode;
a_type *newlist = NULL, *morelist = NULL;
pos = posChanged;
initpos = 0;
count = 0;
if(_assign == 1){
attrib = 0;
newnode = (a_type*)malloc(_var*sizeof(a_type));
for(i=0; i<_size; i++){
if(possibilities[pos] == valueRetified){
memcpy(newnode, &possibilities[i*_var], _var*sizeof(a_type));
count++;
if(newlist!=NULL){
morelist = (a_type*)malloc(count*_var*sizeof(a_type));
memcpy(morelist, newlist, (count-1)*_var*sizeof(a_type));
}
newlist = (a_type*)malloc(count*_var*sizeof(a_type));
memcpy(newlist, morelist, (count-1)*_var*sizeof(a_type));
memcpy(&newlist[initpos], newnode, _var*sizeof(a_type));
initpos+=_var;
attrib = 1;
}
pos+=_var;
}
if(attrib == 1){
_size = count;
possibilities = (a_type*)malloc(_size*_var*sizeof(a_type));
if(possibilities == NULL){
printf("Allocation fail in newlist retify.\n");
exit(1);
}
memcpy(possibilities, newlist, _size*_var*sizeof(a_type));
}
else{
_assign = 0;
}
}
}
CUDA_CALLABLE_MEMBER a_type* unstack(){
a_type* solution = NULL, *backup = NULL;
if(_assign == 1){
if(_size>0){
backup = (a_type*)malloc(_var*_size*sizeof(a_type));
if(backup == NULL){
printf("Erro to alloc backup pointer on unstack function in data intern\n");
return NULL;
}
solution = (a_type*)malloc(_var*sizeof(a_type));
if(solution == NULL){
printf("Erro to alloc solution pointer on unstack function in data intern\n");
return NULL;
}
memcpy(backup, possibilities, _size*_var*sizeof(a_type));
memcpy(solution, possibilities, _var*sizeof(a_type));
free(possibilities);
_size--;
possibilities = (a_type*)malloc(_size*_var*sizeof(a_type));
if(possibilities == NULL){
printf("Error to realloc possibilities pointer in data intern\n");
return NULL;
}
memcpy(possibilities, &backup[_var], _size*_var*sizeof(a_type));
free(backup);
return solution;
}
}
return NULL;
}
CUDA_CALLABLE_MEMBER int get_size(){
return _size;
}
CUDA_CALLABLE_MEMBER ~dataIntern(){
_assign = 0;
if(possibilities)
free(possibilities);
}
};
deviceCode.h:
#ifndef DEVICECODE_H
#define DEVICECODE_H
void CallingInMain();
__global__ void kernel();
#endif
deviceCode.cu:
#include "deviceCode.h"
#include "dataIntern.h"
#include <iostream>
#include <stdio.h>
//I declared like this to my kernel:
__global__ void kernel(){
__shared__ dataIntern<int> data[32];
int *vetor;
vetor = NULL;
data[threadIdx.x] = dataIntern<int>(3);
//_var == 3 in the class above
vetor = (int*)malloc(sizeof(int)*3);
vetor = data[threadIdx.x].unstack();
while(vetor!=NULL){
//never past here
printf("%d %d %d %d\n", threadIdx.x, vetor[0], vetor[1], vetor[2]);
vetor = data[threadIdx.x].unstack();
}
//neither here in if or else
if(vetor)
printf("Not null\n");
else
printf("Null final\n");
free(vetor);
}
void CallingInMain(){
kernel<<<1, 32>>>();
cudaDeviceSynchronize();
}
main.cu:
#include <iostream>
#include <stdio.h>
#ifndef deviceCode_H
#include "deviceCode.h"
#endif
int main(int argc, char* argv[]){
CallingInMain();
return 0;
}
答案 0 :(得分:1)
有些同事向我指出,你的代码似乎有错误。
在内核中考虑这一行:
data[threadIdx.x] = dataIntern<int>(3);
此行实例化一个临时dataIntern<int>
对象,在其上运行值为3
的构造函数,然后从该对象复制到data[threadIdx.x]
中的存储。请注意,构造函数执行malloc
操作:
CUDA_CALLABLE_MEMBER dataIntern(int var){
...
possibilities = (a_type*)malloc(_maxsize*sizeof(a_type));
但由于原始对象是临时的,因此C ++标准允许在语句结束时删除对象,即在分号处删除:
data[threadIdx.x] = dataIntern<int>(3);
^
复制构建过程完成后。但删除对象会触发析构函数,析构函数会对free
执行possibilities
操作:
CUDA_CALLABLE_MEMBER ~dataIntern(){
_assign = 0;
if(possibilities)
free(possibilities);
}
因此在这行代码之后使用如此分配的指针:
data[threadIdx.x] = dataIntern<int>(3);
例如在unstack
这里:
vetor = data[threadIdx.x].unstack();
无效。
这违反了C ++编程规则,并且错误并非特定于CUDA。