I am writing a CUDA kernel to copy one array to another; both arrays are in GPU memory. I don't want to use cudaMemcpyDeviceToDevice because of its poor performance.
The naive kernel:
__global__ void GpuCopy(float* des, float* __restrict__ sour, const int M, const int N)
{
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    if (tx < N * M)
        des[tx] = sour[tx];
}
I expected the naive kernel not to achieve high performance, so I tried using __shared__ memory, but it does not look any better:
__shared__ float TILE[tile];   // tile is a compile-time constant
int tid = threadIdx.x;
for (int i = 0; i < M * N / tile; i++)
{
    TILE[tid] = sour[i * tile + tid];
    des[i * tile + tid] = TILE[tid];
}
The former snippet copies global memory directly to des[], while the latter copies global memory to __shared__ and then __shared__ to des[]. I think the latter is slower than the former.
So, how should the __shared__ version be written to copy memory efficiently? A second question: if I want to use __constant__ memory and the array (already on the GPU) is larger than constant memory, how can I copy it to another GPU buffer through __constant__ memory?
Answer 0 (score: 1)
For an ordinary linear-to-linear memory copy, shared memory won't give you any benefit. Your naive kernel should be fine. There may be a small optimization available by running with a reduced number of thread blocks (for example, with a grid-stride loop, as in Answer 2 below), but tuning that will depend on your specific GPU.
Shared memory can be useful in kernels that perform some kind of modified copy, such as a transpose operation; in those cases, the cost of the trip through shared memory is offset by improved coalescing (see the sketch below). But for your naive kernel, both the reads and the writes should already be coalesced.
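For illustration, here is a minimal sketch of the tiled-transpose pattern where shared memory does pay off; the kernel name, TILE_DIM, and the padding column are my own choices, not taken from the question:

#define TILE_DIM 32

// Minimal tiled transpose of an N x N matrix. Launch with
//   dim3 block(TILE_DIM, TILE_DIM);
//   dim3 grid((N + TILE_DIM - 1) / TILE_DIM, (N + TILE_DIM - 1) / TILE_DIM);
__global__ void transposeShared(float* out, const float* __restrict__ in, int N)
{
    __shared__ float tile[TILE_DIM][TILE_DIM + 1];   // +1 column avoids bank conflicts

    // Coalesced read of one tile from the input.
    int x = blockIdx.x * TILE_DIM + threadIdx.x;
    int y = blockIdx.y * TILE_DIM + threadIdx.y;
    if (x < N && y < N)
        tile[threadIdx.y][threadIdx.x] = in[y * N + x];

    __syncthreads();

    // Swap the block indices so the write is also coalesced;
    // the "rotation" happens inside the shared-memory tile.
    x = blockIdx.y * TILE_DIM + threadIdx.x;
    y = blockIdx.x * TILE_DIM + threadIdx.y;
    if (x < N && y < N)
        out[y * N + x] = tile[threadIdx.x][threadIdx.y];
}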
For a single large copy operation, cudaMemcpyDeviceToDevice should give very good performance, since the overhead of the single call is amortized over the entire data movement. You may simply want to time the two approaches; that is easy to do with nvprof. The discussion referenced in the comments concerns a specific use case of swapping matrix quadrants. In that case, an NxN matrix requires ~1.5N cudaMemcpy operations, which are being compared against a single kernel call; there, the overhead of setting up the API calls starts to become a significant factor. But when comparing a single cudaMemcpy operation against a single equivalent kernel call, the cudaMemcpy operation should be fast.
Device code cannot write to __constant__ memory, so you would have to use host code based on cudaMemcpyFromSymbol and cudaMemcpyToSymbol.
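To make that concrete, here is a minimal sketch of my own (the 64 KB buffer c_buf and the function name are assumptions, not from the original answer) that bounces a large device array through __constant__ memory chunk by chunk from the host. It works, but it is strictly slower than a direct cudaMemcpy:

#include <algorithm>
#include <cuda_runtime.h>

// Hypothetical staging buffer: 64 KB is the usual __constant__ memory limit.
__constant__ float c_buf[16384];

// Copy n floats from d_src to d_dst, bouncing each chunk through c_buf.
// Purely illustrative: this cannot beat a direct device-to-device copy.
void copyViaConstant(float* d_dst, const float* d_src, size_t n)
{
    const size_t chunk = 16384;
    for (size_t off = 0; off < n; off += chunk)
    {
        size_t count = std::min(chunk, n - off) * sizeof(float);
        // Both transfers stay on the device; only the calls come from the host.
        cudaMemcpyToSymbol(c_buf, d_src + off, count, 0, cudaMemcpyDeviceToDevice);
        cudaMemcpyFromSymbol(d_dst + off, c_buf, count, 0, cudaMemcpyDeviceToDevice);
    }
}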
Answer 1 (score: 0)
Robert Crovella has already answered this question. Here, I am just offering sample code to compare the two approaches to device-to-device memory copies in CUDA: using cudaMemcpyDeviceToDevice, and using a copy kernel.

The Code

The test code is the following:
#include <stdio.h>

#include "Utilities.cuh"
#include "TimingGPU.cuh"

#define BLOCKSIZE 512

/***************/
/* COPY KERNEL */
/***************/
__global__ void copyKernel(const double * __restrict__ d_in, double * __restrict__ d_out, const int N) {
    const int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= N) return;
    d_out[tid] = d_in[tid];
}

/********/
/* MAIN */
/********/
int main() {
    const int N = 1000000;

    TimingGPU timerGPU;

    double *h_test = (double *)malloc(N * sizeof(double));
    for (int k = 0; k < N; k++) h_test[k] = 1.;

    double *d_in;  gpuErrchk(cudaMalloc(&d_in,  N * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_in, h_test, N * sizeof(double), cudaMemcpyHostToDevice));

    double *d_out; gpuErrchk(cudaMalloc(&d_out, N * sizeof(double)));

    timerGPU.StartCounter();
    gpuErrchk(cudaMemcpy(d_out, d_in, N * sizeof(double), cudaMemcpyDeviceToDevice));
    printf("cudaMemcpy timing = %f [ms]\n", timerGPU.GetCounter());

    timerGPU.StartCounter();
    copyKernel<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_in, d_out, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    printf("Copy kernel timing = %f [ms]\n", timerGPU.GetCounter());

    return 0;
}
The Utilities.cu and Utilities.cuh files are maintained here, while the TimingGPU.cu and TimingGPU.cuh files are maintained here (a minimal sketch of these helpers follows below).
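For readers without access to those links, the following is a minimal sketch of the helpers the listing depends on; these implementations are inferred from how gpuErrchk, iDivUp, and TimingGPU are used above and are not the original files:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Assumed equivalent of gpuErrchk from Utilities.cuh: abort on any CUDA error.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char* file, int line)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        exit(code);
    }
}

// Assumed equivalent of iDivUp: integer division rounded up.
inline int iDivUp(int a, int b) { return (a + b - 1) / b; }

// Assumed equivalent of TimingGPU: event-based millisecond timer.
struct TimingGPU
{
    cudaEvent_t start_, stop_;
    TimingGPU()  { cudaEventCreate(&start_); cudaEventCreate(&stop_); }
    ~TimingGPU() { cudaEventDestroy(start_); cudaEventDestroy(stop_); }
    void StartCounter() { cudaEventRecord(start_, 0); }
    float GetCounter()
    {
        cudaEventRecord(stop_, 0);
        cudaEventSynchronize(stop_);
        float ms = 0.f;
        cudaEventElapsedTime(&ms, start_, stop_);
        return ms;
    }
};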
Timings
Tests performed on a GeForce GTX 960 card. Timings are in ms.
N           cudaMemcpyDeviceToDevice    copy kernel
1000        0.0075                      0.029
10000       0.0078                      0.072
100000      0.019                       0.068
1000000     0.20                        0.22
The results confirm Robert Crovella's conjecture: cudaMemcpyDeviceToDevice is generally preferable to a copy kernel.
Answer 2 (score: 0)
#include <iostream>
#include <vector>
#include <iomanip>

#include <cuda_runtime.h>

#define CHECK_CUDA(cond) check_cuda(cond, __LINE__)

void check_cuda(cudaError_t status, std::size_t line)
{
    if (status != cudaSuccess)
    {
        std::cout << cudaGetErrorString(status) << '\n';
        std::cout << "Line: " << line << '\n';
        throw 0;
    }
}

// Grid-stride loop: the grid sized by cudaOccupancyMaxPotentialBlockSize below
// covers arbitrarily large N with a fixed number of blocks.
__global__ void copy_kernel(float* __restrict__ output, const float* __restrict__ input, int N)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x)
        output[i] = input[i];
}

int main()
{
    constexpr int num_trials = 100;
    std::vector<int> test_sizes = { 100'000, 1'000'000, 10'000'000, 100'000'000, 250'000'000 };

    int grid_size = 0, block_size = 0;
    CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, copy_kernel, 0));

    std::cout << std::fixed << std::setprecision(4) << std::endl;

    for (auto sz : test_sizes)
    {
        std::cout << "Test Size: " << sz << '\n';

        float *d_vector_src = nullptr, *d_vector_dest = nullptr;
        CHECK_CUDA(cudaMalloc(&d_vector_src, sz * sizeof(float)));
        CHECK_CUDA(cudaMalloc(&d_vector_dest, sz * sizeof(float)));

        cudaEvent_t start, stop;
        CHECK_CUDA(cudaEventCreate(&start));
        CHECK_CUDA(cudaEventCreate(&stop));

        float accumulate = 0.0;
        for (int i = 0; i < num_trials; i++)
        {
            CHECK_CUDA(cudaEventRecord(start));
            copy_kernel<<<grid_size, block_size>>>(d_vector_dest, d_vector_src, sz);
            CHECK_CUDA(cudaEventRecord(stop));
            CHECK_CUDA(cudaEventSynchronize(stop));

            float current_time = 0;
            CHECK_CUDA(cudaEventElapsedTime(&current_time, start, stop));
            accumulate += current_time;
        }
        std::cout << "\tKernel Copy Time: " << accumulate / num_trials << "ms\n";

        accumulate = 0.0;
        for (int i = 0; i < num_trials; i++)
        {
            CHECK_CUDA(cudaEventRecord(start));
            CHECK_CUDA(cudaMemcpy(d_vector_dest, d_vector_src, sz * sizeof(float), cudaMemcpyDeviceToDevice));
            CHECK_CUDA(cudaEventRecord(stop));
            CHECK_CUDA(cudaEventSynchronize(stop));

            float current_time = 0;
            CHECK_CUDA(cudaEventElapsedTime(&current_time, start, stop));
            accumulate += current_time;
        }
        std::cout << "\tMemcpy Time: " << accumulate / num_trials << "ms\n";

        CHECK_CUDA(cudaEventDestroy(start));
        CHECK_CUDA(cudaEventDestroy(stop));
        CHECK_CUDA(cudaFree(d_vector_src));
        CHECK_CUDA(cudaFree(d_vector_dest));
    }
    return 0;
}
GTX 1050 Mobile
Test Size: 100000
Kernel Copy Time: 0.0118ms
Memcpy Time: 0.0127ms
Test Size: 1000000
Kernel Copy Time: 0.0891ms
Memcpy Time: 0.0899ms
Test Size: 10000000
Kernel Copy Time: 0.8697ms
Memcpy Time: 0.8261ms
Test Size: 100000000
Kernel Copy Time: 8.8871ms
Memcpy Time: 8.2401ms
Test Size: 250000000
Kernel Copy Time: 22.3060ms
Memcpy Time: 20.5419ms
GTX 1080 Ti
Test Size: 100000
Kernel Copy Time: 0.0166ms
Memcpy Time: 0.0188ms
Test Size: 1000000
Kernel Copy Time: 0.0580ms
Memcpy Time: 0.0727ms
Test Size: 10000000
Kernel Copy Time: 0.4674ms
Memcpy Time: 0.5047ms
Test Size: 100000000
Kernel Copy Time: 4.7992ms
Memcpy Time: 3.7722ms
Test Size: 250000000
Kernel Copy Time: 7.2485ms
Memcpy Time: 5.5863ms
Test Size: 1000000000
Kernel Copy Time: 31.5570ms
Memcpy Time: 22.3184ms
RTX 2080 Ti
Test Size: 100000
Kernel Copy Time: 0.0048ms
Memcpy Time: 0.0054ms
Test Size: 1000000
Kernel Copy Time: 0.0193ms
Memcpy Time: 0.0220ms
Test Size: 10000000
Kernel Copy Time: 0.1578ms
Memcpy Time: 0.1537ms
Test Size: 100000000
Kernel Copy Time: 2.1156ms
Memcpy Time: 1.5006ms
Test Size: 250000000
Kernel Copy Time: 5.5195ms
Memcpy Time: 3.7424ms
Test Size: 1000000000
Kernel Copy Time: 23.2106ms
Memcpy Time: 14.9483ms