我正在使用 Cudafy.NET,在 BlockSize 上遇到了一些问题:某些 BlockSize 取值会得到不同的结果。简而言之,差别就在这里:
// Grid size 1, block size 7: the results are correct with this launch line.
gpu.Launch(1, 7, "kernelfx_alldata", 10, devdata, devnmin, devnmax, devgmin, devgmax, devtest);
// Grid size 1, block size 8: the results are incorrect with this launch line.
gpu.Launch(1, 8, "kernelfx_alldata", 10, devdata, devnmin, devnmax, devgmin, devgmax, devtest);
关于问题的详细解释:
我有10个要循环的项目。 GridSize是1。
CASE 1:当 CudafyModes.Target = eGPUType.OpenCL 且 BlockSize 为 1、2、3、4、5、6 或 7 时,结果正确。
CASE 2:当 CudafyModes.Target = eGPUType.OpenCL 且 BlockSize 为 8、9、10、11…… 或更大时,结果不正确。
CASE 3:当 CudafyModes.Target = eGPUType.Emulator 且 BlockSize 为 1、2、3、4、5、6、7、8、9、10、11…… 等任意值时,结果正确。
示例代码如下所示。 初始化变量:
// Host-side input table: 10 rows by 10 columns, filled by initializeVars().
double[,] data;
// Host-side per-column scaling bounds (10 entries each), filled by initializeVars().
double[] nmin, nmax, gmin, gmax;

/// <summary>
/// Allocates the host arrays and fills them with the fixed sample values used by the example.
/// Each row of <c>data</c> is derived from its row index; <c>nmin</c>/<c>nmax</c> are the
/// constants -1 and 1, <c>gmin[i]</c> is <c>i</c> and <c>gmax[i]</c> is <c>11 * i * i + 1</c>.
/// </summary>
void initializeVars()
{
    const int count = 10;

    data = new double[count, count];
    nmin = new double[count];
    nmax = new double[count];
    gmin = new double[count];
    gmax = new double[count];

    for (int row = 0; row < count; row++)
    {
        // Column values for this row, in column order 0..9.
        double[] rowValues =
        {
            100 + row,
            32 + row,
            22 + row,
            -20 - row,
            5522 + 10 * row,
            40 + row,
            14 - row,
            12 + row,
            -10 + row,
            10 + 10 * row
        };

        for (int col = 0; col < count; col++)
        {
            data[row, col] = rowValues[col];
        }

        nmin[row] = -1;
        nmax[row] = 1;
        gmin[row] = row;
        gmax[row] = 11 * row * row + 1;
    }
}
gpu启动代码:
/// <summary>
/// Click handler that runs the example on the GPU: selects OpenCL device 0, translates and
/// loads the Cudafy module, uploads the host arrays prepared by initializeVars(), launches
/// "kernelfx_alldata" with grid size 1 and block size 8 over 10 items, and copies the
/// output back into a host array.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    CudafyModes.Target = eGPUType.OpenCL;
    CudafyModes.DeviceId = 0;
    CudafyTranslator.Language = eLanguage.OpenCL;

    CudafyModule km = CudafyTranslator.Cudafy();
    Cudafy.Host.GPGPU gpu = Cudafy.Host.CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
    gpu.LoadModule(km);

    initializeVars();
    double[] test = new double[10];

    try
    {
        // Allocate a device buffer for each host array and upload its contents.
        double[,] devdata = gpu.Allocate<double>(data); gpu.CopyToDevice(data, devdata);
        double[] devnmin = gpu.Allocate<double>(nmin); gpu.CopyToDevice(nmin, devnmin);
        double[] devnmax = gpu.Allocate<double>(nmax); gpu.CopyToDevice(nmax, devnmax);
        double[] devgmin = gpu.Allocate<double>(gmin); gpu.CopyToDevice(gmin, devgmin);
        double[] devgmax = gpu.Allocate<double>(gmax); gpu.CopyToDevice(gmax, devgmax);

        // Output buffer: allocated on the device only, read back after the launch.
        double[] devtest = gpu.Allocate<double>(test);

        gpu.Launch(1, 8, "kernelfx_alldata", 10, devdata, devnmin,
            devnmax, devgmin, devgmax, devtest);
        gpu.CopyFromDevice(devtest, test);
    }
    finally
    {
        // Release the device buffers even when an allocation, copy or launch throws.
        gpu.FreeAll();
    }
}
Cudafy内核
/// <summary>
/// Kernel from the question (the version that gives wrong results for larger block sizes).
/// For every item index tid below N it copies data[tid, 0..8] into tmp[1..9], rescales each
/// tmp[i] with the bounds stored at index i - 1 of nmin/nmax/gmin/gmax, and writes tmp[1]
/// to test[tid].
/// </summary>
/// <param name="thread">Cudafy thread handle supplying threadIdx, blockIdx, blockDim and gridDim.</param>
/// <param name="N">Number of items to process; items with index >= N are skipped.</param>
/// <param name="data">Input table; row tid, columns 0..8 are read.</param>
/// <param name="nmin">Lower bound of the target range, indexed by column.</param>
/// <param name="nmax">Upper bound of the target range, indexed by column.</param>
/// <param name="gmin">Lower bound of the source range, indexed by column.</param>
/// <param name="gmax">Upper bound of the source range, indexed by column.</param>
/// <param name="test">Output array; element tid receives the rescaled first column.</param>
[Cudafy]
public static void kernelfx_alldata(GThread thread, int N, double[,] data, double[] nmin, double[] nmax, double[] gmin, double[] gmax, double[] test)
{
// Starting item index for this thread, derived from its thread and block indices.
int tid = thread.threadIdx.x + thread.blockIdx.x * thread.blockDim.x;
while (tid < N)
{
// NOTE(review): tmp comes from AllocateShared with one 10-element buffer named "tmp", and
// every thread writes tmp[0..9] regardless of tid. Presumably all threads of a block see the
// same buffer, so concurrent threads would overwrite each other's values - this would match
// the wrong results reported for larger block sizes. Confirm against the Cudafy documentation.
double[] tmp = thread.AllocateShared<double>("tmp", 10);
tmp[0] = 1;
// Stage the item's first nine columns into tmp[1..9].
for (int i = 1; i < 10; i++)
{
tmp[i] = data[tid, i - 1];
}
// Rescale each staged value using the bounds stored at index i - 1.
for (int i = 1; i < 10; i++)
{
tmp[i] = (nmax[i - 1] - nmin[i - 1]) / (gmax[i - 1] - gmin[i - 1]) * (tmp[i] - gmin[i - 1]) + nmin[i - 1];
}
// Only the rescaled first column is written to the output.
test[tid] = tmp[1];
// Advance by the total number of launched threads (blockDim.x * gridDim.x).
tid = tid + thread.blockDim.x * thread.gridDim.x;
}
}
正确(案例1和案例3)结果是:
test[0]=199.0 test[1]=201.0 test[2]=203.0 test[3]=205.0 test[4]=207.0 test[5]=209.0 test[6]=211.0 test[7]=213.0 test[8]=215.0 test[9]=217.0
错误(CASE 2)的结果是:
test[0]=213.0 test[1]=213.0 test[2]=213.0 test[3]=213.0 test[4]=213.0 test[5]=213.0 test[6]=213.0 test[7]=213.0 test[8]=217.0 test[9]=217.0
当BlockSize小于8时,结果是正确的;但当BlockSize大于或等于8时,结果不正确。为了有效地利用GPU,BlockSize必须大于8。
此代码有什么问题?
最诚挚的问候......
答案 0 :(得分:1)
将tmp声明为二维数组,并用线程索引(tid)作为第一维的下标,即可解决问题。可正常工作的代码如下:
/// <summary>
/// Kernel from the answer: each item index gets its own row of the 10 x 10 buffer obtained
/// from AllocateShared. For every item index below N, the first nine columns of the item's
/// row in <paramref name="data"/> are rescaled with the bounds at the same column index and
/// stored in columns 1..9 of the item's tmp row; column 1 is then written to
/// <paramref name="test"/>.
/// </summary>
/// <remarks>
/// The tmp buffer has 10 rows and is indexed by the item index, so N must not exceed 10.
/// </remarks>
/// <param name="thread">Cudafy thread handle supplying threadIdx, blockIdx, blockDim and gridDim.</param>
/// <param name="N">Number of items to process; items with index >= N are skipped.</param>
/// <param name="data">Input table; for each item, columns 0..8 of its row are read.</param>
/// <param name="nmin">Lower bound of the target range, indexed by column.</param>
/// <param name="nmax">Upper bound of the target range, indexed by column.</param>
/// <param name="gmin">Lower bound of the source range, indexed by column.</param>
/// <param name="gmax">Upper bound of the source range, indexed by column.</param>
/// <param name="test">Output array; the element at the item index receives the rescaled first column.</param>
[Cudafy]
public static void kernelfx_alldata(GThread thread, int N, double[,] data, double[] nmin,
    double[] nmax, double[] gmin, double[] gmax, double[] test)
{
    // Total number of launched threads; each thread advances by this many items per pass.
    int step = thread.blockDim.x * thread.gridDim.x;
    double[,] tmp = thread.AllocateShared<double>("tmp", 10, 10);

    for (int item = thread.threadIdx.x + thread.blockIdx.x * thread.blockDim.x; item < N; item += step)
    {
        tmp[item, 0] = 1;

        for (int col = 1; col < 10; col++)
        {
            int src = col - 1;
            double raw = data[item, src];
            // Map raw from the [gmin, gmax] range to the [nmin, nmax] range for this column.
            tmp[item, col] = (nmax[src] - nmin[src]) / (gmax[src] - gmin[src]) * (raw - gmin[src]) + nmin[src];
        }

        // Only the rescaled first column is written to the output.
        test[item] = tmp[item, 1];
    }
}