这是我在下面发布的内核的 Visual Profiler 分析结果。请注意网格(1)和块(1024)的大小,以及它本应使用64个寄存器,实际却只使用了43个寄存器。我使用的是 Tesla K40c。
#include <stdio.h>
#include <stdlib.h>
#include "cuda.h"
#include "curand.h"
#include <cuda_runtime.h>
#include "math.h"
#include <curand_kernel.h>
#include <time.h>
#include <algorithm>
#include <iostream>
#define iterations 159744
#define transMatrixSize 2592 // Just for clarity. Do not change.
#define reps 1024 // Is equal to blocksize. Do not change
#define integralStep 13125 // Number of time steps to be averaged at the tail
/*
 * Evaluates one row group of the transition table for the uniform draw `r`.
 *
 * The six floats starting at transMatrix[index] are read as three
 * (threshold, delta) pairs. The result is the sum of:
 *   - delta 0 when r is below threshold 0,
 *   - delta 1 when r is not below threshold 0 but is below threshold 1,
 *   - delta 2 when r is not below threshold 1 but is below threshold 2.
 */
static __device__ __forceinline__ float bufferleechStep(const float * __restrict__ transMatrix, int index, float r)
{
    return (r < transMatrix[index]) * (transMatrix[index + 1]) +
           (!(r < transMatrix[index])) * (r < transMatrix[index + 2]) * (transMatrix[index + 3]) +
           (!(r < transMatrix[index + 2])) * (r < transMatrix[index + 4]) * (transMatrix[index + 5]);
}

/*
 * bufferleech: each thread evolves a private chain of 24 sites (RU[1..24],
 * with RU[0] and RU[25] kept at zero as boundary cells) for `iterations`
 * time steps. After every step the block sums, over all of its threads, the
 * number of sites whose state is 4 or 5, and thread 0 stores the normalised
 * sum for that step.
 *
 * Site encoding (one int per site, decimal digits):
 *   hundreds digit - "100%" state, tens digit - binary flag, ones digit - "50%" state.
 *
 * Parameters:
 *   masterForces   - out, `iterations` floats; per-step result for the hundreds-digit state.
 *   masterForces50 - out, `iterations` floats; per-step result for the ones-digit state.
 *   transMatrix    - in, transition table read in groups of six floats
 *                    (three threshold/delta pairs); indices up to 2591 are reachable
 *                    when all states stay in 0..5.
 *   rands          - in, one seed per thread of the grid, indexed by the flat global thread index.
 *   r_max          - not used by this kernel.
 *
 * Launch requirements:
 *   - blockDim.x * blockDim.y must equal `reps` (1024): the shared scratch arrays
 *     and the reduction tree are sized for exactly that many threads.
 *   - Every block writes the same masterForces[i] slots, so the output is only
 *     well defined for a single-block launch.
 *   - Static shared memory: 2 * reps floats.
 */
__global__ void
__launch_bounds__(1024,1)
bufferleech(float *masterForces, float *masterForces50, const float * __restrict__ transMatrix, const float *rands, const int r_max)
{
    // Flat index of this thread across the whole grid; selects the RNG seed.
    const int globalIdx = ((blockIdx.x + (blockIdx.y * gridDim.x)) * (blockDim.x * blockDim.y)) + (threadIdx.x + (threadIdx.y * blockDim.x));
    // Flat index inside the block. Shared memory is per block, so it must be
    // indexed with this value rather than with the grid-wide index.
    const int tid = threadIdx.x + (threadIdx.y * blockDim.x);
    (void)r_max;

    curandState s;
    curand_init(rands[globalIdx], 0, 0, &s);

    // Per-block scratch for the two reductions. Every cross-thread access is
    // separated by __syncthreads(), so `volatile` is not needed.
    __shared__ float buffer[reps];
    __shared__ float buffer50[reps];

    // Initial condition: each interior site gets its tens digit set with probability 1/2.
    int RU[26] = {0};
    for (int e = 1; e < 25; e++)
    {
        if (curand_uniform(&s) < 0.5f)
        {
            RU[e] += 10;
        }
    }

    for (int i = 0; i < iterations; i++)
    {
        // Left-neighbour states as they were BEFORE this step's update.
        int leftsnap = 0;
        int leftsnap50 = 0;

        for (int j = 1; j < 25; j++)
        {
            const float r = curand_uniform(&s);

            // ---- hundreds-digit ("100%") state ----
            const int RUsnapshot = RU[j] / 100;
            // index = (((left * 6 + right) * 6 + self) * 2) * 6
            int index = ((((leftsnap * 6 + RU[j + 1] / 100) * 6 + RU[j] / 100) * 2) * 6);
            RU[j] += 100 * bufferleechStep(transMatrix, index, r);
            leftsnap = RUsnapshot;

            // ---- ones-digit ("50%") state, additionally keyed by the tens-digit flag ----
            const int RUsnapshot50 = RU[j] % 10;
            // index = (((left * 6 + right) * 6 + self) * 2 + flag) * 6
            index = ((((leftsnap50 * 6 + (RU[j + 1] % 10)) * 6 + (RU[j] % 10)) * 2 + (RU[j] % 100) / 10) * 6);
            RU[j] += bufferleechStep(transMatrix, index, r);
            leftsnap50 = RUsnapshot50;
        }

        // Count this thread's sites that are in state 4 or 5.
        int count = 0;
        int count50 = 0;
        for (int z = 1; z < 25; z++)
        {
            const int state = RU[z] / 100;
            const int state50 = RU[z] % 10;
            count += (state == 4) + (state == 5);
            count50 += (state50 == 4) + (state50 == 5);
        }

        buffer[tid] = (float)count;
        buffer50[tid] = (float)count50;
        __syncthreads();

        // Tree reduction over the block. The barrier sits outside the branch so
        // all threads reach it, and it separates each level's writes from the
        // next level's reads. It also guarantees that nobody is still reading
        // another thread's slot when the next time step overwrites it.
        for (int stride = 1; stride < reps; stride <<= 1)
        {
            if ((tid % (2 * stride)) == 0)
            {
                buffer[tid] += buffer[tid + stride];
                buffer50[tid] += buffer50[tid + stride];
            }
            __syncthreads();
        }

        // 24576 = 1024 * 24, i.e. threads per block times sites per thread.
        if (tid == 0)
        {
            masterForces[i] = buffer[0] / 24576.0f;
            masterForces50[i] = buffer50[0] / 24576.0f;
        }
    }
}
如何让这个内核使用64个寄存器?寄存器肯定还有富余,因为我编写的一个类似内核在使用 launch bounds 后可以用到多达116个寄存器。
谢谢。
如果您想自己运行它,下面是 main 函数:
/* Prints a diagnostic and terminates the program when a CUDA runtime call fails. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Host driver: uploads the transition table and one random seed per thread,
 * launches bufferleech as a single block of 1024 threads on its own stream,
 * copies both result series back to the host and releases every resource.
 * The results are not printed or stored anywhere.
 *
 * Returns 0 on success; exits with EXIT_FAILURE on the first CUDA error.
 */
int main()
{
    srand((unsigned)time(NULL));
    checkCuda(cudaSetDevice(0), "cudaSetDevice");
    cudaStream_t s6;
    checkCuda(cudaStreamCreate(&s6), "cudaStreamCreate");
    // Transition table consumed by the kernel in groups of six floats.
    float tm[transMatrixSize] = {0.068571, 1, 0.069143, 2.000000, 0, 0, 0, 0, 0.069143, 2.000000, 0, 0, 0.003810, 2.000000, 0.004670, -1, 0, 0, 0.003810, 2.000000, 0, 0, 0, 0, 0, 2.000000, 0.074743, -2.000000, 0.143315, 1, 0, 2.000000, 0.074743, -2.000000, 0, 0, 0, 2.000000, 0.074743, -2.000000, 0.074872, -1, 0, 2.000000, 0.074743, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.069143, 2.000000,
0, 0, 0, 0, 0.069143, 2.000000, 0, 0, 0.003810, 2.000000, 0.004670, -1, 0, 0, 0.003810, 2.000000, 0, 0, 0, 0, 0, 2.000000, 0.074743, -2.000000, 0.143315, 1, 0, 2.000000, 0.074743, -2.000000, 0, 0, 0, 2.000000, 0.074743, -2.000000, 0.074872, -1, 0, 2.000000, 0.074743, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701,
2.000000, 0, 0, 0.020866, 2.000000, 0.021727, -1, 0, 0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.013778, -1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701, 2.000000, 0, 0, 0.020866, 2.000000,
0.021727, -1, 0, 0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.013778, -1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205,
2.000000, 0, 0, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205, 2.000000, 0, 0, 0, 0,
0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.069143, 2.000000, 0, 0, 0, 0, 0.069143, 2.000000, 0, 0, 0.003810, 2.000000, 0.004670, -1, 0, 0, 0.003810, 2.000000, 0, 0, 0, 0, 0, 2.000000, 0.074743, -2.000000, 0.143315,
1, 0, 2.000000, 0.074743, -2.000000, 0, 0, 0, 2.000000, 0.074743, -2.000000, 0.074872, -1, 0, 2.000000, 0.074743, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.069143, 2.000000, 0, 0, 0, 0, 0.069143, 2.000000, 0, 0, 0.003810, 2.000000, 0.004670, -1, 0, 0, 0.003810, 2.000000, 0, 0, 0, 0, 0, 2.000000, 0.074743, -2.000000, 0.143315, 1, 0, 2.000000, 0.074743, -2.000000,
0, 0, 0, 2.000000, 0.074743, -2.000000, 0.074872, -1, 0, 2.000000, 0.074743, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701, 2.000000, 0, 0, 0.020866, 2.000000, 0.021727, -1, 0, 0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649,
-2.000000, 0.013778, -1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701, 2.000000, 0, 0, 0.020866, 2.000000, 0.021727, -1, 0, 0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.013778, -1, 0.000003, 2.000000,
0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205, 2.000000, 0, 0, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.068571,
1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205, 2.000000, 0, 0, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0,
0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701, 2.000000, 0, 0, 0.020866, 2.000000, 0.021727, -1, 0, 0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.013778, -1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0,
0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701, 2.000000, 0, 0, 0.020866, 2.000000, 0.021727, -1, 0, 0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.013778, -1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1,
0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.085714, 2.000000, 0, 0, 0, 0, 0.085714, 2.000000, 0, 0, 0.114286, 2.000000, 0.115147, -1, 0, 0, 0.114286, 2.000000, 0, 0, 0, 0, 0.000021, 2.000000, 0.002513, -2.000000, 0.071084, 1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.000021, 2.000000, 0.002513, -2.000000, 0.002642, -1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0,
0, 0, 0, 0.068571, 1, 0.085714, 2.000000, 0, 0, 0, 0, 0.085714, 2.000000, 0, 0, 0.114286, 2.000000, 0.115147, -1, 0, 0, 0.114286, 2.000000, 0, 0, 0, 0, 0.000021, 2.000000, 0.002513, -2.000000, 0.071084, 1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.000021, 2.000000, 0.002513, -2.000000, 0.002642, -1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1,
0.113927, 2.000000, 0, 0, 0, 0, 0.113927, 2.000000, 0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.113927, 2.000000, 0, 0, 0,
0, 0.113927, 2.000000, 0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701, 2.000000, 0, 0,
0.020866, 2.000000, 0.021727, -1, 0, 0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.013778, -1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701, 2.000000, 0, 0, 0.020866, 2.000000, 0.021727, -1, 0,
0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.013778, -1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.085714, 2.000000, 0, 0, 0, 0, 0.085714, 2.000000, 0, 0, 0.114286, 2.000000, 0.115147, -1, 0, 0, 0.114286, 2.000000, 0, 0,
0, 0, 0.000021, 2.000000, 0.002513, -2.000000, 0.071084, 1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.000021, 2.000000, 0.002513, -2.000000, 0.002642, -1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.085714, 2.000000, 0, 0, 0, 0, 0.085714, 2.000000, 0, 0, 0.114286, 2.000000, 0.115147, -1, 0, 0, 0.114286, 2.000000, 0, 0, 0, 0, 0.000021, 2.000000, 0.002513,
-2.000000, 0.071084, 1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.000021, 2.000000, 0.002513, -2.000000, 0.002642, -1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.113927, 2.000000, 0, 0, 0, 0, 0.113927, 2.000000, 0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000,
0.000984, -2.000000, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.113927, 2.000000, 0, 0, 0, 0, 0.113927, 2.000000, 0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.000043,
2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205, 2.000000, 0, 0, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1,
0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205, 2.000000, 0, 0, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1, 0.000006, 2.000000, 0.005164, -2.000000, 0,
0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.113927, 2.000000, 0, 0, 0, 0, 0.113927, 2.000000, 0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000,
0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.113927, 2.000000, 0, 0, 0, 0, 0.113927, 2.000000, 0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660,
-2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.188571, 2.000000, 0, 0, 0, 0, 0.188571, 2.000000, 0, 0, 0.800000, 2.000000, 0.800861, -1, 0, 0, 0.800000, 2.000000, 0, 0, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.069013, 1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.000570, -1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000,
0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.188571, 2.000000, 0, 0, 0, 0, 0.188571, 2.000000, 0, 0, 0.800000, 2.000000, 0.800861, -1, 0, 0, 0.800000, 2.000000, 0, 0, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.069013, 1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.000570, -1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088,
-2.000000, 0, 0, 0, 0, 0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205, 2.000000, 0, 0, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0,
0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205, 2.000000, 0, 0, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.113927, 2.000000, 0,
0, 0, 0, 0.113927, 2.000000, 0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.113927, 2.000000, 0, 0, 0, 0, 0.113927, 2.000000,
0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.188571, 2.000000, 0, 0, 0, 0, 0.188571, 2.000000, 0, 0, 0.800000, 2.000000, 0.800861,
-1, 0, 0, 0.800000, 2.000000, 0, 0, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.069013, 1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.000570, -1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.188571, 2.000000, 0, 0, 0, 0, 0.188571, 2.000000, 0, 0, 0.800000, 2.000000, 0.800861, -1, 0, 0, 0.800000, 2.000000,
0, 0, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.069013, 1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.000570, -1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0};

    // Host result buffers, one value per simulated time step.
    float *h_F6 = new float[(iterations)];
    float *h_F50_6 = new float[(iterations)];

    // One RNG seed per thread of the block.
    float h_S6[reps];
    for (int i = 0; i < reps; i++)
    {
        h_S6[i] = float(rand());
    }

    // Device input/output vectors
    float *d_F6 = NULL;
    float *d_F50_6 = NULL;
    float *d_S6 = NULL;
    float *d_TM6 = NULL;

    // Allocate memory for each vector on GPU
    checkCuda(cudaMalloc((void**)&d_F6, iterations * sizeof(float)), "cudaMalloc(d_F6)");
    checkCuda(cudaMalloc((void**)&d_F50_6, iterations * sizeof(float)), "cudaMalloc(d_F50_6)");
    checkCuda(cudaMalloc((void**)&d_S6, reps * sizeof(float)), "cudaMalloc(d_S6)");
    checkCuda(cudaMalloc((void**)&d_TM6, transMatrixSize * sizeof(float)), "cudaMalloc(d_TM6)");

    // Copy host vectors to device
    checkCuda(cudaMemcpyAsync(d_S6, h_S6, reps * sizeof(float), cudaMemcpyHostToDevice, s6), "cudaMemcpyAsync(d_S6)");
    checkCuda(cudaMemcpyAsync(d_TM6, tm, transMatrixSize * sizeof(float), cudaMemcpyHostToDevice, s6), "cudaMemcpyAsync(d_TM6)");

    bufferleech<<<1, 1024, 0, s6>>>(d_F6, d_F50_6, d_TM6, d_S6, reps);
    // A kernel launch returns no status itself; launch-configuration errors
    // are only visible through cudaGetLastError().
    checkCuda(cudaGetLastError(), "bufferleech launch");

    checkCuda(cudaMemcpyAsync(h_F6, d_F6, iterations * sizeof(float), cudaMemcpyDeviceToHost, s6), "cudaMemcpyAsync(h_F6)");
    checkCuda(cudaMemcpyAsync(h_F50_6, d_F50_6, iterations * sizeof(float), cudaMemcpyDeviceToHost, s6), "cudaMemcpyAsync(h_F50_6)");

    // Wait for all work queued on the stream before releasing any buffer it
    // touches; this is also where errors raised during kernel execution surface.
    checkCuda(cudaStreamSynchronize(s6), "cudaStreamSynchronize");

    checkCuda(cudaFree(d_F6), "cudaFree(d_F6)");
    checkCuda(cudaFree(d_F50_6), "cudaFree(d_F50_6)");
    checkCuda(cudaFree(d_S6), "cudaFree(d_S6)");
    checkCuda(cudaFree(d_TM6), "cudaFree(d_TM6)");
    checkCuda(cudaStreamDestroy(s6), "cudaStreamDestroy");

    // Arrays allocated with new[] must be released with delete[].
    delete[] h_F6;
    delete[] h_F50_6;

    checkCuda(cudaDeviceReset(), "cudaDeviceReset");
    return 0;
}
答案 0 :(得分:1)
launch bounds 只是(间接地)设定了每个线程可用寄存器数量的上限,它不会强制编译器为每个线程使用特定数量的寄存器。
如果编译器每个线程只需要46个寄存器,则启动边界不会强制它使用更多。
你写的“类似”内核使用了更多寄存器,这一事实并不能说明什么。内核代码中看似无害的改动,也可能导致寄存器用量大不相同。
如果您希望程序运行得更快,请使用多个线程块。这是一个非常重要的优化目标,而不是努力增加每线程使用的寄存器。
如果您愿意,可以使用 binary utilities 检查生成的机器代码。不过,这除了能确认每个线程实际使用的寄存器数量之外,对于理解寄存器的使用情况并没有太多指导意义。