设备代码中的错误memcpy

时间:2015-10-20 23:22:37

标签: cuda memcpy

我编写了一个代码,它获得了一个可能性向量的第一个_var位置(即矩阵_size * _var,_var = 3和_size = 27)并在我的内核中调用此函数(32个线程,即每个都有一个对象) )但我没有得到函数的任何返回值既没有NULL指针。
程序退出时没有错误但是内核中的printf行没有被执行或显示(甚至用sm_20或更高版本编译),好像程序之前停止一样。
dataIntern.h:

#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#define _MIN -1
#define _MAX 1

#ifdef __CUDACC__
#define CUDA_CALLABLE_MEMBER __host__ __device__
#else
#define CUDA_CALLABLE_MEMBER
#endif

template <class a_type>
class dataIntern{
private:
    a_type *possibilities;
    int _assign;
    int _size;
    int _var;
    int _maxsize;

public:
    CUDA_CALLABLE_MEMBER dataIntern(){
    }

    CUDA_CALLABLE_MEMBER dataIntern(int var){
        _var = var;
        _size = (int)pow(3.0, (double)_var);
        _maxsize = _size * _var;
        _assign = 1;
        possibilities = (a_type*)malloc(_maxsize*sizeof(a_type));
        if(!possibilities){
            exit(1);
        }
        createTable();
    }

    CUDA_CALLABLE_MEMBER void createTable(){
        int i, j, k, limit, pos;
        a_type value;
        if(_assign == 1){
            for(i=0; i<_var; i++){
                #ifdef __CUDA_ARCH__
                    limit = (int)pow(3.0, _var-i-1);
                #else
                    limit = (int)pow(3.0, (double)_var-i-1);
                #endif

                value = (a_type)_MIN;
                k = 0;
                for(j=0; j<_size; j++){
                    pos = _var*j+i;
                    if(k >= limit){
                        value++;
                        if(value > _MAX){
                            value = (a_type)_MIN;
                        }
                        k = 0;
                    }
                    possibilities[pos] = value;
                    k++;
                }
            }           
        }
    }

    CUDA_CALLABLE_MEMBER void print(){
        int i;

        printf("Printing.\n");
        if(_assign == 1){
            for(i=0; i<_size*_var; i++){
                printf("%d ", possibilities[i]);
                if(i%_var == _var-1){
                    printf("\n");
                }
            }
        }
        else{
            printf("Not assigned.\n");
        }
    }

    CUDA_CALLABLE_MEMBER void retify(int posChanged, a_type valueRetified){
        int i, pos, count, initpos, attrib;
        a_type *newnode;
        a_type *newlist = NULL, *morelist = NULL;
        pos = posChanged;
        initpos = 0;
        count = 0;      

        if(_assign == 1){
            attrib = 0;
            newnode = (a_type*)malloc(_var*sizeof(a_type));
            for(i=0; i<_size; i++){
                if(possibilities[pos] == valueRetified){
                    memcpy(newnode, &possibilities[i*_var], _var*sizeof(a_type));

                    count++;
                    if(newlist!=NULL){
                        morelist = (a_type*)malloc(count*_var*sizeof(a_type));
                        memcpy(morelist, newlist, (count-1)*_var*sizeof(a_type));
                    }
                    newlist = (a_type*)malloc(count*_var*sizeof(a_type));
                    memcpy(newlist, morelist, (count-1)*_var*sizeof(a_type));
                    memcpy(&newlist[initpos], newnode, _var*sizeof(a_type));

                    initpos+=_var;
                    attrib = 1;
                }
                pos+=_var;
            }

            if(attrib == 1){
                _size = count;
                possibilities = (a_type*)malloc(_size*_var*sizeof(a_type));
                if(possibilities == NULL){
                    printf("Allocation fail in newlist retify.\n");
                    exit(1);
                }
                memcpy(possibilities, newlist, _size*_var*sizeof(a_type));
            }
            else{
                _assign = 0;
            }
        }
    }

    CUDA_CALLABLE_MEMBER a_type* unstack(){
        a_type* solution = NULL, *backup = NULL;

        if(_assign == 1){
            if(_size>0){
                backup = (a_type*)malloc(_var*_size*sizeof(a_type));
                if(backup == NULL){
                    printf("Erro to alloc backup pointer on unstack function in data intern\n");
                    return NULL;
                }

                solution = (a_type*)malloc(_var*sizeof(a_type));
                if(solution == NULL){
                    printf("Erro to alloc solution pointer on unstack function in data intern\n");
                    return NULL;
                }
                memcpy(backup, possibilities, _size*_var*sizeof(a_type));                   
                memcpy(solution, possibilities, _var*sizeof(a_type));

                free(possibilities);

                _size--;
                possibilities = (a_type*)malloc(_size*_var*sizeof(a_type));
                if(possibilities == NULL){
                    printf("Error to realloc possibilities pointer in data intern\n");
                    return NULL;
                }
                memcpy(possibilities, &backup[_var], _size*_var*sizeof(a_type));

                free(backup);
                return solution;
            }           
        }
        return NULL;
    }

    CUDA_CALLABLE_MEMBER int get_size(){
        return _size;
    }

    CUDA_CALLABLE_MEMBER ~dataIntern(){
        _assign = 0;
        if(possibilities)
            free(possibilities);
    }
};

deviceCode.h:

#ifndef DEVICECODE_H
#define DEVICECODE_H

void CallingInMain();
__global__ void kernel();
#endif

deviceCode.cu:

#include "deviceCode.h"
#include "dataIntern.h"
#include <iostream>
#include <stdio.h>


//I declared like this to my kernel:
__global__ void kernel(){
    __shared__ dataIntern<int> data[32];
    int *vetor;

    vetor = NULL;
    data[threadIdx.x] = dataIntern<int>(3);

    //_var == 3 in the class above
    vetor = (int*)malloc(sizeof(int)*3);

    vetor = data[threadIdx.x].unstack();
    while(vetor!=NULL){
        //never past here
        printf("%d %d %d %d\n", threadIdx.x, vetor[0], vetor[1], vetor[2]);
        vetor = data[threadIdx.x].unstack();
    }
    //neither here in if or else
    if(vetor)
        printf("Not null\n");
    else
        printf("Null final\n");

    free(vetor);
}

void CallingInMain(){
    kernel<<<1, 32>>>();
    cudaDeviceSynchronize();
}

main.cu:

#include <iostream>
#include <stdio.h>

#ifndef deviceCode_H
#include "deviceCode.h"
#endif

int main(int argc, char* argv[]){


    CallingInMain();

    return 0;
}

1 个答案:

答案 0 :(得分:1)

有些同事向我指出,你的代码似乎有错误。

在内核中考虑这一行:

data[threadIdx.x] = dataIntern<int>(3);

此行实例化一个临时dataIntern<int>对象,在其上运行值为3的构造函数,然后从该对象复制到data[threadIdx.x]中的存储。请注意,构造函数执行malloc操作:

CUDA_CALLABLE_MEMBER dataIntern(int var){
    ...
    possibilities = (a_type*)malloc(_maxsize*sizeof(a_type));

但由于原始对象是临时的,因此C ++标准允许在语句结束时删除对象,即在分号处删除:

data[threadIdx.x] = dataIntern<int>(3);
                                      ^
复制构建过程完成后

。但删除对象会触发析构函数,析构函数会对free执行possibilities操作:

CUDA_CALLABLE_MEMBER ~dataIntern(){
    _assign = 0;
    if(possibilities)
        free(possibilities);
}

因此在这行代码之后使用如此分配的指针:

data[threadIdx.x] = dataIntern<int>(3);

例如在unstack这里:

vetor = data[threadIdx.x].unstack();

无效。

这违反了C ++编程规则,并且错误并非特定于CUDA。