无法从纹理内存中读出值

时间:2012-09-04 13:17:38

标签: cuda textures gpu gpu-programming

您好我正在编写一个简单的程序来练习使用纹理内存。我只想将我的数据写入纹理存储器并将其写回全局存储器。但我不会读出价值观。这是代码。

#include <stdio.h>
#include <iostream>
#include "cuda.h"
#include <stdlib.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "HelloWorld.h"

#include "linearInterpolation_kernel4.cu"

using namespace std;
using std::cout;

const int blocksize = 16; 

__global__ 
void hello(char *a, int *b) {
    a[threadIdx.x] += b[threadIdx.x];
}



////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions

// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
#define checkCudaErrors(err)           __checkCudaErrors (err, __FILE__, __LINE__)

inline void __checkCudaErrors( cudaError err, const char *file, const int line )
{
    if( cudaSuccess != err) {
        printf("%s(%i) : CUDA Runtime API error %d: %s.\n",file, line, (int)err, cudaGetErrorString( err ) );

    }
}

// This will output the proper error string when calling cudaGetLastError
#define getLastCudaError(msg)      __getLastCudaError (msg, __FILE__, __LINE__)

inline void __getLastCudaError( const char *errorMessage, const char *file, const int line )
{
    cudaError_t err = cudaGetLastError();
    if( cudaSuccess != err) {
        printf("%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n", file, line, errorMessage, (int)err, cudaGetErrorString( err ) );

    }
}

int main()
{
    int N = 40; 
    float *A; 
    A = (float *) malloc(N*sizeof(float));
    float *B;
    B = (float *) malloc(N*sizeof(float));
    float *result;
    result = (float *) malloc(N*sizeof(float));
    float angle = 0.8f; 

    for(int i = 0; i < N; i++){
        A[i] = i; //(float)rand();
        B[i] = i+1; //(float)rand();
    }
    ipLinearTexture2(A,B,result,angle,N);

    float result2;

    result2 = (angle)*A[4] + (1-angle)*B[4]; 

    printf(" A %f B %f Result %f\n", A[4], B[4], result[4]);
    cout << result2 << endl;

    return 1;
}

void ipLinearTexture2(float *A, float* B, float* result, float angle, int N)
{
    float cuTime;

    int N2 = N * 2;
    float *dev_result;

    float **AB;

    AB = (float **) malloc( N * sizeof(float *));

    if(AB)
    {
        for(int i = 0; i < N; i++)
        {
            AB[i] = (float *) malloc( 2 * sizeof(float *));
        }
    }

    for (int i = 0; i < N; i = i++)
    {
        AB[i][0] = A[i];
        AB[i][1] = B[i];
    }

    cudaMalloc(&dev_result, N * sizeof(float));
    unsigned int size = N2 * sizeof(float);

    //cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
    cudaArray* cu_array;

    checkCudaErrors(cudaMallocArray( &cu_array, &channelDesc,N,2)); 
    cudaMemcpy2DToArray(cu_array,0,0,AB,N * sizeof(float), N * sizeof(float), 2, cudaMemcpyHostToDevice);

    // set texture parameters
    tex2.normalized = true;  
    tex2.filterMode = cudaFilterModeLinear;
    tex2.addressMode[0] = cudaAddressModeWrap; //cudaAddressModeWrap;
    tex2.addressMode[1] = cudaAddressModeWrap; //cudaAddressModeClamp;

    checkCudaErrors(cudaBindTextureToArray( tex2, cu_array, channelDesc));

    dim3 dimBlock(10, 1, 1);
    dim3 dimGrid((int)ceil((double)N*2/dimBlock.x), 1, 1);

    transformKernel4<<< 256, 256, 0 >>>( dev_result, N, 2, angle);

    checkCudaErrors(cudaMemcpy(result, dev_result, N * sizeof(float), cudaMemcpyDeviceToHost));
    cout << "==================================================" << endl;

    for (int i = 0 ; i < N ;i++)
    {
        cout << result[i] << " on " << i << endl;   
    }

    cout << "==================================================" << endl;
    checkCudaErrors(cudaUnbindTexture(tex));
    checkCudaErrors(cudaFree(dev_result));
    checkCudaErrors(cudaFreeArray(cu_array));
}

这是内核代码

#ifndef _SIMPLETEXTURE_KERNEL5_H_
#define _SIMPLETEXTURE_KERNEL5_H_

// Texture references

texture<float, 2, cudaReadModeElementType> tex2;

__global__ void
transformKernel4(float* g_odata, int width, int height, float theta) 
{
    unsigned int xid = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int yid = blockIdx.y * blockDim.y + threadIdx.y;

    if (xid >= width || yid >= height) return; 

    float dx = 1.0f / (float)width;
    float dy = 1.0f / (float)height;

    float x = ((float)xid + 0.5f) * dx;
    float y = ((float)yid + 0.5f) * dy;


        float value = tex2D(tex2, x , y);
        printf("wert %f xid %i yid %i \n",value, xid, yid);
g_odata[yid * width + xid] = value;

    }
#endif // #ifndef _SIMPLETEXTURE_KERNEL_H_

有人能说出我做错了什么吗? 我编辑了它以消除前两个逻辑错误。为什么我需要能够打印出我的数据呢?

2 个答案:

答案 0 :(得分:1)

这是数组的错误绑定。您不能在C中使用可以复制的多维数组。您必须使用一个代表多维的onedimensional数组。

答案 1 :(得分:0)

我在这里可以看到2个逻辑错误。

第一个是@asm指出的那个。 应通过计算2D x和y指数的线性指数来存储输出。

outputIndex = yid * width + xid;

第二个是cudaArray结构的内存分配是内部对齐的。 您应该考虑使用cudaMemcpy2DToArray函数来避免错误的数据复制。

cudaMemcpy2DToArray(cu_array,0,0,AB,N * sizeof(float), N * sizeof(float), 2, cudaMemcpyHostToDevice);