我刚刚开始学习CUDA,想比较CUDA代码与CPU代码在简单字符串搜索上的性能。
搜索方法:如果数据字符串的前x个字符与关键字完全相同,则返回true。(x = 关键字的长度)
有100个关键字和10000个数据字符串。我想做的是以并行方式完成这些比较,并对比各种实现的耗时。我写了4个不同的内核和一份CPU代码。然而,我得到的结果相当令人困惑。
searchKeywordKernel:创建4 * 32个线程。每个线程获取一个关键字并将其与10000个数据字符串进行比较,然后将结果写入bool数组。耗时2650ms。
searchKeywordKernel2:创建10 * 1024个线程。每个线程获取一个数据字符串并将其与100个关键字进行比较,然后将结果写入bool数组。花了1397ms。
searchKeywordKernel3:创建1 * 1线程。它的行为类似于cpu代码,生成结果需要279ms。
searchKeywordKernel4:创建977 * 1024个线程。每个线程进行一次字符串比较,花费1334ms。
CPU:进行1000000次字符串比较。花了265毫秒。
我想问几个问题:
为什么searchKeywordKernel3生成结果所用的时间与CPU代码相近?我仔细检查了代码,但没有发现任何问题。
为什么CPU代码比除searchKeywordKernel3之外的其他内核都快?是因为读取操作还是数据规模?
硬件信息:显卡:NVidia GT730,处理器:Intel i5-4460。
用于生成结果的代码是:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <chrono>
#define SEARCHTERMSIZE 100
#define SEARCHITEMSIZE 10000
#define STRINGSIZE 250
using namespace std;
// Strategy 1: one thread per keyword. Thread k tests keyword k as a prefix of
// every data string and writes result[k * SEARCHITEMSIZE + d] for data string d.
// Both inputs hold NUL-terminated strings stored in fixed STRINGSIZE-byte slots.
__global__ void searchKeywordKernel(bool* result, char* data, char* keyword)
{
    const int keywordSlot = blockIdx.x * blockDim.x + threadIdx.x;
    if (keywordSlot >= SEARCHTERMSIZE)
        return; // surplus threads of the last block have no keyword to handle

    for (int dataSlot = 0; dataSlot < SEARCHITEMSIZE; ++dataSlot)
    {
        const int out = keywordSlot * SEARCHITEMSIZE + dataSlot;
        int d = dataSlot * STRINGSIZE;

        // Assume a match until a differing character (or the end of the data
        // string) is met before the keyword is exhausted.
        result[out] = true;
        for (int k = keywordSlot * STRINGSIZE; keyword[k] != '\0'; ++k, ++d)
        {
            if (keyword[k] != data[d] || data[d] == '\0')
            {
                result[out] = false;
                break;
            }
        }
    }
}
// Strategy 2: one thread per data string. Thread d tests every keyword as a
// prefix of data string d and writes result[k * SEARCHITEMSIZE + d] for keyword k.
// Both inputs hold NUL-terminated strings stored in fixed STRINGSIZE-byte slots.
__global__ void searchKeywordKernel2(bool* result, char* data, char* keyword)
{
    const int dataSlot = blockIdx.x * blockDim.x + threadIdx.x;
    if (dataSlot >= SEARCHITEMSIZE)
        return; // surplus threads of the last block have no data string to handle

    for (int keywordSlot = 0; keywordSlot < SEARCHTERMSIZE; ++keywordSlot)
    {
        const int out = keywordSlot * SEARCHITEMSIZE + dataSlot;
        int d = dataSlot * STRINGSIZE;

        // Assume a match until a differing character (or the end of the data
        // string) is met before the keyword is exhausted.
        result[out] = true;
        for (int k = keywordSlot * STRINGSIZE; keyword[k] != '\0'; ++k, ++d)
        {
            if (keyword[k] != data[d] || data[d] == '\0')
            {
                result[out] = false;
                break;
            }
        }
    }
}
// Strategy 3: a single thread performs every keyword/data comparison
// sequentially, mirroring the CPU reference loop. Only the thread with global
// index 0 does any work. result is filled in keyword-major order, i.e.
// result[k * SEARCHITEMSIZE + d] for keyword k and data string d.
//
// Fix: the data-string index is now restarted for every keyword. Previously it
// was initialised once outside both loops, so after the first keyword the
// inner loop never executed again and only the first SEARCHITEMSIZE results
// were written (the kernel did 1/SEARCHTERMSIZE of the intended work).
// Expect the corrected kernel to run correspondingly longer.
__global__ void searchKeywordKernel3(bool* result, char* data, char* keyword)
{
    if (threadIdx.x + blockIdx.x * blockDim.x >= 1)
        return;

    int resultIndex = 0;
    for (int keywordStringIndex = 0; keywordStringIndex < SEARCHTERMSIZE; keywordStringIndex++)
    {
        for (int dataStringIndex = 0; dataStringIndex < SEARCHITEMSIZE; dataStringIndex++)
        {
            int dataCharIndex = dataStringIndex*STRINGSIZE;
            int keywordCharIndex = keywordStringIndex*STRINGSIZE;

            // Assume a match until a differing character (or the end of the
            // data string) is met before the keyword is exhausted.
            result[resultIndex] = true;
            while (keyword[keywordCharIndex] != '\0')
            {
                if ((keyword[keywordCharIndex] != data[dataCharIndex]) || (data[dataCharIndex] == '\0'))
                {
                    result[resultIndex] = false;
                    break;
                }
                keywordCharIndex++;
                dataCharIndex++;
            }
            resultIndex++;
        }
    }
}
// Strategy 4: one thread per (keyword, data string) pair. The flat thread index
// is split into keyword = id / SEARCHITEMSIZE and data string = id % SEARCHITEMSIZE,
// and each thread writes exactly one element of result.
__global__ void searchKeywordKernel4(bool* result, char* data, char* keyword)
{
    const int pairId = blockIdx.x * blockDim.x + threadIdx.x;
    if (pairId >= SEARCHTERMSIZE*SEARCHITEMSIZE)
        return; // surplus threads of the last block have no pair to handle

    int k = (pairId / SEARCHITEMSIZE) * STRINGSIZE; // first char of this thread's keyword
    int d = (pairId % SEARCHITEMSIZE) * STRINGSIZE; // first char of this thread's data string

    // keyword * SEARCHITEMSIZE + dataString recombines to pairId itself.
    bool* const out = result + pairId;
    *out = true;
    while (keyword[k] != '\0')
    {
        if (keyword[k] != data[d] || data[d] == '\0')
        {
            *out = false;
            break;
        }
        ++k;
        ++d;
    }
}
// Benchmark driver: builds random-length strings of 'a', runs the four GPU
// kernels and a CPU reference over the same inputs, prints the elapsed time of
// each stage and reports whether every kernel reproduced the CPU result.
//
// Changes over the original:
//  * the two CUDA events are created once and destroyed at the end (they used
//    to be re-created before every measurement and never destroyed);
//  * d_result is cleared before every kernel, so a kernel that fails to write
//    some results can no longer pass verification thanks to the output left
//    behind by the previous kernel;
//  * CUDA runtime errors (including launch errors) are reported on stderr.
int main()
{
    typedef chrono::duration<int, milli> millisecs_t;
    chrono::steady_clock::time_point startTime;
    chrono::steady_clock::time_point endTime;

    const size_t dataBytes = sizeof(char) * SEARCHITEMSIZE * STRINGSIZE;
    const size_t keywordBytes = sizeof(char) * SEARCHTERMSIZE * STRINGSIZE;
    const size_t resultBytes = sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE;

    // Report, but do not abort on, a failing CUDA runtime call.
    auto check = [](cudaError_t err, const char* what)
    {
        if (err != cudaSuccess)
            cerr << "CUDA error (" << what << "): " << cudaGetErrorString(err) << endl;
    };

    //////////Search Data Init/////////////////
    cout << "Before Search Data Init" << endl;
    startTime = chrono::steady_clock::now();
    char* data = new char[SEARCHITEMSIZE*STRINGSIZE](); // zero-filled
    for (int i = 0; i < SEARCHITEMSIZE; i++)
    {
        int dataIndex = i*STRINGSIZE;
        const int length = rand() % (STRINGSIZE-21) + 20; // 20..248 characters
        for (int k = 0; k < length; k++)
            data[dataIndex++] = 'a';
        data[dataIndex] = '\0';
    }
    endTime = chrono::steady_clock::now();
    millisecs_t duration(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "After Search Data Init: " << duration.count() << "ms" <<endl;

    //////////Search Keyword Init/////////////////
    cout << "Before Search Keyword Init" << endl;
    startTime = chrono::steady_clock::now();
    char* keyword = new char[SEARCHTERMSIZE*STRINGSIZE](); // zero-filled
    for (int i = 0; i < SEARCHTERMSIZE; i++)
    {
        int keywordIndex = i*STRINGSIZE;
        const int length = rand() % (STRINGSIZE - 21) + 20; // 20..248 characters
        for (int k = 0; k < length; k++)
            keyword[keywordIndex++] = 'a';
        keyword[keywordIndex] = '\0';
    }
    endTime = chrono::steady_clock::now();
    millisecs_t duration1(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "After Search Keyword Init: " << duration1.count() << "ms" << endl;

    bool* result = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
    bool* result2 = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
    bool* result3 = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
    bool* result4 = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
    char* d_data = 0;
    char* d_keyword = 0;
    bool* d_result = 0;

    /////////////////////////CudaMalloc/////////////////////////////////
    cout << "Before Malloc" << endl;
    startTime = chrono::steady_clock::now();
    check(cudaMalloc(&d_data, dataBytes), "cudaMalloc(d_data)");
    check(cudaMalloc(&d_keyword, keywordBytes), "cudaMalloc(d_keyword)");
    check(cudaMalloc(&d_result, resultBytes), "cudaMalloc(d_result)");
    endTime = chrono::steady_clock::now();
    millisecs_t duration2(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "After Malloc: " << duration2.count() << "ms" << endl;

    // One pair of events is reused for every GPU-side measurement.
    cudaEvent_t start, stop;
    float elapsedTime = 0.0f;
    check(cudaEventCreate(&start), "cudaEventCreate(start)");
    check(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    auto startTimer = [&]()
    {
        check(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    };
    // Records the stop event, waits for it and stores the time in elapsedTime.
    // The synchronisation also surfaces errors raised while a kernel executed.
    auto stopTimer = [&]()
    {
        check(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
        check(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
        check(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
    };
    // Clears the shared output buffer so each kernel is verified on its own
    // output only. Issued before the start event, hence not part of the timing.
    auto clearResult = [&]()
    {
        check(cudaMemset(d_result, 0, resultBytes), "cudaMemset(d_result)");
    };

    /////////////////////////CudaMemCpy///////////////////////////////////
    cout << "Before Memcpy" << endl;
    startTimer();
    check(cudaMemcpy(d_data, data, dataBytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_data)");
    check(cudaMemcpy(d_keyword, keyword, keywordBytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_keyword)");
    stopTimer();
    cout << "After Memcpy: " << elapsedTime << "ms" << endl;

    ////////////////////////Kernel//////////////////////////////////////////
    clearResult();
    cout << "Before Kernel" << endl;
    startTimer();
    searchKeywordKernel <<<(SEARCHTERMSIZE/32)+1, 32 >>>(d_result, d_data, d_keyword);
    check(cudaGetLastError(), "searchKeywordKernel launch");
    stopTimer();
    cout << "After Kernel: " << elapsedTime << "ms" << endl;
    check(cudaMemcpy(result, d_result, resultBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(result)");

    ////////////////////////Kernel2//////////////////////////////////////////
    clearResult();
    cout << "Before Kernel2" << endl;
    startTimer();
    searchKeywordKernel2 <<<(SEARCHITEMSIZE/1024)+1, 1024 >>>(d_result, d_data, d_keyword);
    check(cudaGetLastError(), "searchKeywordKernel2 launch");
    stopTimer();
    cout << "After Kernel2: " << elapsedTime << "ms" << endl;
    check(cudaMemcpy(result2, d_result, resultBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(result2)");

    ////////////////////////Kernel3//////////////////////////////////////////
    clearResult();
    cout << "Before Kernel3" << endl;
    startTimer();
    searchKeywordKernel3 <<<1, 1 >>>(d_result, d_data, d_keyword);
    check(cudaGetLastError(), "searchKeywordKernel3 launch");
    stopTimer();
    cout << "After Kernel3: " << elapsedTime << "ms" << endl;
    check(cudaMemcpy(result3, d_result, resultBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(result3)");

    ////////////////////////Kernel4//////////////////////////////////////////
    clearResult();
    cout << "Before Kernel4" << endl;
    startTimer();
    searchKeywordKernel4 <<<((SEARCHITEMSIZE*SEARCHTERMSIZE)/1024)+1, 1024 >>>(d_result, d_data, d_keyword);
    check(cudaGetLastError(), "searchKeywordKernel4 launch");
    stopTimer();
    cout << "After Kernel4: " << elapsedTime << "ms" << endl;
    check(cudaMemcpy(result4, d_result, resultBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(result4)");

    /////////////////////////////////// CPU code //////////////////////////////////////////
    bool* cpuResult = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
    int nonParallelResultIndex = 0;
    cout << "CPU code starts" << endl;
    startTime = chrono::steady_clock::now();
    for (int i = 0; i < SEARCHTERMSIZE; i++)
    {
        for (int j = 0; j < SEARCHITEMSIZE; j++)
        {
            int keywordCharIndex = i*STRINGSIZE;
            int dataCharIndex = j*STRINGSIZE;
            cpuResult[nonParallelResultIndex] = true;
            while (keyword[keywordCharIndex] != '\0')
            {
                if ((keyword[keywordCharIndex] != data[dataCharIndex]) || (data[dataCharIndex] == '\0'))
                {
                    cpuResult[nonParallelResultIndex] = false;
                    break;
                }
                keywordCharIndex++;
                dataCharIndex++;
            }
            nonParallelResultIndex++;
        }
    }
    endTime = chrono::steady_clock::now();
    millisecs_t duration3(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "CPU code ends: " << duration3.count() << "ms" << endl;

    ////////////////////////////////////Result Comparison////////////////////////////////////////
    bool kernel1Res = true;
    bool kernel2Res = true;
    bool kernel3Res = true;
    bool kernel4Res = true;
    for (int i = 0; i < SEARCHITEMSIZE*SEARCHTERMSIZE; i++)
    {
        if (cpuResult[i] != result[i])
            kernel1Res = false;
        if (cpuResult[i] != result2[i])
            kernel2Res = false;
        if (cpuResult[i] != result3[i])
            kernel3Res = false;
        if (cpuResult[i] != result4[i])
            kernel4Res = false;
        // Nothing more to learn once every kernel has been found to differ.
        if (!kernel1Res && !kernel2Res && !kernel3Res && !kernel4Res)
            break;
    }
    cout << boolalpha << "Kernel1 computation: " << kernel1Res << endl;
    cout << boolalpha << "Kernel2 computation: " << kernel2Res << endl;
    cout << boolalpha << "Kernel3 computation: " << kernel3Res << endl;
    cout << boolalpha << "Kernel4 computation: " << kernel4Res << endl;

    cout << "Before Deleting arrays" << endl;
    delete[] data;
    delete[] keyword;
    delete[] result;
    delete[] result2;
    delete[] result3;
    delete[] result4;
    delete[] cpuResult;
    cout << "After Deleting arrays" << endl;

    cout << "Before Freeing device memory" << endl;
    check(cudaFree(d_data), "cudaFree(d_data)");
    check(cudaFree(d_keyword), "cudaFree(d_keyword)");
    check(cudaFree(d_result), "cudaFree(d_result)");
    cout << "After Freeing device memory" << endl;

    check(cudaEventDestroy(start), "cudaEventDestroy(start)");
    check(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    cudaDeviceReset();
    system("pause");
    return 0;
}
提前谢谢。
答案 0 :(得分:2)
您的代码在功能上似乎是正确的——这是程序员的首要任务。那么如何让它运行得更快呢?
为了获得性能,CUDA程序员首先应该了解的两个概念是:
您需要大量的线程。通常我们需要10,000个或更多线程,而且线程再多一些通常也不会带来明显的损失。之所以需要大量线程,是由机器体系结构决定的——GPU是一台隐藏延迟的机器,它通过拥有大量可以即时切换的待执行工作来隐藏延迟。这里的“工作”可以粗略地理解为“线程”。
您希望高效地利用内存系统。这可能涉及许多不同的方面,但我们首先要关注的是对全局内存的合并访问(coalesced access)。(您没有在任何内核中使用共享内存,但如果使用了,我们还希望对共享内存的访问没有存储体冲突(bank conflict)。)我们还希望高效地使用数据;最后,与任何计算机优化一样,我们希望利用内存层次结构,寻找数据重用的机会,并把这些数据移到内存层次结构中“更高”的级别。
那么这对您的代码意味着什么?如果想写一个“快速”的内核,您需要大量线程,并且要力求全局内存加载100%合并。因此,内核1和内核3的策略看起来不太好——它们启动的线程根本不够多。内核2好一些,但内核4的策略可能更好——它允许我们启动100 * 10000个线程,这符合我们对“大量”的定义。因此,让我们沿用这种线程策略:每个线程负责生成 `result` 数组的一个元素(因为共有100 * 10000个结果)。
现在来看合并访问,这归结为数据的组织方式。相邻线程如何访问数据?访问的地址是否连续?就您的内核4而言,并不连续:在执行实际工作的 `while` 循环中遍历时,相邻线程从 `data` 中读取的位置之间间隔非常大。
要解决此问题,我们可以转置我们的数据。我选择使用数据重用优化:
- 每个线程块负责 `data` 中的一个字符串,块内的每个线程负责一个关键字,即生成与该 `data` 元素关联的 `result` 项。
- 由于一个线程块中的所有线程都使用 `data` 的同一个元素(字符串),我们可以将该元素(字符串)移动到共享内存中,这样每个线程块只需从全局内存读取一次,然后每个线程再从共享内存中取出所需的值。这意味着 `data` 中的每个字符串只从全局内存中读取一次,这是最佳的。
- 因此,我们不需要转置 `data` 就能实现最佳的合并加载。但是我们仍然需要转置 `keyword` 中的字符串,因为每个线程都将通过全局加载来读取它。这里的有利之处在于整个 `keyword` 数组要小得多——大约25K字节,可以放入GPU的L1缓存(如果可用),当然也可以放入L2。

根据我的测试,通过上述策略和选择,我能够写出比CPU代码快约5倍的内核。由于这个内核很可能主要受内存带宽限制,我们的性能大致已处在合理的水平。下面是一个功能完整的示例,在您的代码中添加了第5个内核,该内核源自您的第4个内核,但使用 `keyword` 数组的转置形式:
$ cat t703.cu
#include <stdio.h>
#include <iostream>
#include <chrono>
#define SEARCHTERMSIZE 100
#define SEARCHITEMSIZE 10000
#define STRINGSIZE 250
using namespace std;
// Strategy 1: one thread per keyword. Thread k tests keyword k as a prefix of
// every data string and writes result[k * SEARCHITEMSIZE + d] for data string d.
// Both inputs hold NUL-terminated strings stored in fixed STRINGSIZE-byte slots.
__global__ void searchKeywordKernel(bool* result, char* data, char* keyword)
{
    const int keywordSlot = blockIdx.x * blockDim.x + threadIdx.x;
    if (keywordSlot >= SEARCHTERMSIZE)
        return; // surplus threads of the last block have no keyword to handle

    for (int dataSlot = 0; dataSlot < SEARCHITEMSIZE; ++dataSlot)
    {
        const int out = keywordSlot * SEARCHITEMSIZE + dataSlot;
        int d = dataSlot * STRINGSIZE;

        // Assume a match until a differing character (or the end of the data
        // string) is met before the keyword is exhausted.
        result[out] = true;
        for (int k = keywordSlot * STRINGSIZE; keyword[k] != '\0'; ++k, ++d)
        {
            if (keyword[k] != data[d] || data[d] == '\0')
            {
                result[out] = false;
                break;
            }
        }
    }
}
// Strategy 2: one thread per data string. Thread d tests every keyword as a
// prefix of data string d and writes result[k * SEARCHITEMSIZE + d] for keyword k.
// Both inputs hold NUL-terminated strings stored in fixed STRINGSIZE-byte slots.
__global__ void searchKeywordKernel2(bool* result, char* data, char* keyword)
{
    const int dataSlot = blockIdx.x * blockDim.x + threadIdx.x;
    if (dataSlot >= SEARCHITEMSIZE)
        return; // surplus threads of the last block have no data string to handle

    for (int keywordSlot = 0; keywordSlot < SEARCHTERMSIZE; ++keywordSlot)
    {
        const int out = keywordSlot * SEARCHITEMSIZE + dataSlot;
        int d = dataSlot * STRINGSIZE;

        // Assume a match until a differing character (or the end of the data
        // string) is met before the keyword is exhausted.
        result[out] = true;
        for (int k = keywordSlot * STRINGSIZE; keyword[k] != '\0'; ++k, ++d)
        {
            if (keyword[k] != data[d] || data[d] == '\0')
            {
                result[out] = false;
                break;
            }
        }
    }
}
// Strategy 3: a single thread performs every keyword/data comparison
// sequentially, mirroring the CPU reference loop. Only the thread with global
// index 0 does any work. result is filled in keyword-major order, i.e.
// result[k * SEARCHITEMSIZE + d] for keyword k and data string d.
//
// Fix: the data-string index is now restarted for every keyword. Previously it
// was initialised once outside both loops, so after the first keyword the
// inner loop never executed again and only the first SEARCHITEMSIZE results
// were written (the kernel did 1/SEARCHTERMSIZE of the intended work).
// Expect the corrected kernel to run correspondingly longer.
__global__ void searchKeywordKernel3(bool* result, char* data, char* keyword)
{
    if (threadIdx.x + blockIdx.x * blockDim.x >= 1)
        return;

    int resultIndex = 0;
    for (int keywordStringIndex = 0; keywordStringIndex < SEARCHTERMSIZE; keywordStringIndex++)
    {
        for (int dataStringIndex = 0; dataStringIndex < SEARCHITEMSIZE; dataStringIndex++)
        {
            int dataCharIndex = dataStringIndex*STRINGSIZE;
            int keywordCharIndex = keywordStringIndex*STRINGSIZE;

            // Assume a match until a differing character (or the end of the
            // data string) is met before the keyword is exhausted.
            result[resultIndex] = true;
            while (keyword[keywordCharIndex] != '\0')
            {
                if ((keyword[keywordCharIndex] != data[dataCharIndex]) || (data[dataCharIndex] == '\0'))
                {
                    result[resultIndex] = false;
                    break;
                }
                keywordCharIndex++;
                dataCharIndex++;
            }
            resultIndex++;
        }
    }
}
// Strategy 4: one thread per (keyword, data string) pair. The flat thread index
// is split into keyword = id / SEARCHITEMSIZE and data string = id % SEARCHITEMSIZE,
// and each thread writes exactly one element of result.
__global__ void searchKeywordKernel4(bool* result, char* data, char* keyword)
{
    const int pairId = blockIdx.x * blockDim.x + threadIdx.x;
    if (pairId >= SEARCHTERMSIZE*SEARCHITEMSIZE)
        return; // surplus threads of the last block have no pair to handle

    int k = (pairId / SEARCHITEMSIZE) * STRINGSIZE; // first char of this thread's keyword
    int d = (pairId % SEARCHITEMSIZE) * STRINGSIZE; // first char of this thread's data string

    // keyword * SEARCHITEMSIZE + dataString recombines to pairId itself.
    bool* const out = result + pairId;
    *out = true;
    while (keyword[k] != '\0')
    {
        if (keyword[k] != data[d] || data[d] == '\0')
        {
            *out = false;
            break;
        }
        ++k;
        ++d;
    }
}
// Variant of kernel 4 that expects the keyword array in transposed form:
// character i of keyword t is stored at keyword[i * SEARCHTERMSIZE + t].
// Launch layout: one block per data string (blockIdx.x selects the string) and
// at least SEARCHTERMSIZE threads per block (threadIdx.x selects the keyword).
// Uses STRINGSIZE bytes of static shared memory per block.
__global__ void searchKeywordKernel5(bool* result, const char * __restrict__ data, const char * keyword)
{
    __shared__ char sdata[STRINGSIZE];
    const int dataString = blockIdx.x;
    const int lane = threadIdx.x;

    // The test depends on blockIdx only, so a block leaves as a whole and the
    // barrier below is still reached by every thread of each remaining block.
    if (dataString >= SEARCHITEMSIZE)
        return;

    // Stage this block's data string in shared memory (coalesced global load).
    for (int c = lane; c < STRINGSIZE; c += blockDim.x)
        sdata[c] = data[dataString*STRINGSIZE + c];
    __syncthreads();

    if (lane >= SEARCHTERMSIZE)
        return;

    const int out = lane*SEARCHITEMSIZE + dataString;
    result[out] = true; // uncoalesced store - could be improved by reorganizing result
    int pos = 0;
    // Both keyword reads are coalesced global loads thanks to the transposed layout.
    for (char kc = keyword[lane]; kc != '\0'; kc = keyword[pos*SEARCHTERMSIZE + lane])
    {
        const char dc = sdata[pos]; // shared memory broadcast
        if (kc != dc || dc == '\0')
        {
            result[out] = false; // uncoalesced store
            break;
        }
        ++pos;
    }
}
// Benchmark driver: builds random-length strings of 'a', runs the five GPU
// kernels and a CPU reference over the same inputs, prints the elapsed time of
// each stage and reports whether every kernel reproduced the CPU result.
//
// Changes over the original:
//  * keyword_T, result5 and d_keyword_T are now released (they were leaked);
//  * d_result is cleared before every kernel, so a kernel that fails to write
//    some results can no longer pass verification thanks to the output left
//    behind by the previous kernel;
//  * the host string buffers are zero-filled, so the transpose no longer
//    copies uninitialised bytes;
//  * the two CUDA events are created once and reused;
//  * CUDA runtime errors (including launch errors) are reported on stderr.
int main()
{
    typedef chrono::duration<int, milli> millisecs_t;
    chrono::steady_clock::time_point startTime;
    chrono::steady_clock::time_point endTime;

    const size_t dataBytes = sizeof(char) * SEARCHITEMSIZE * STRINGSIZE;
    const size_t keywordBytes = sizeof(char) * SEARCHTERMSIZE * STRINGSIZE;
    const size_t resultBytes = sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE;

    // Report, but do not abort on, a failing CUDA runtime call.
    auto check = [](cudaError_t err, const char* what)
    {
        if (err != cudaSuccess)
            cerr << "CUDA error (" << what << "): " << cudaGetErrorString(err) << endl;
    };

    //////////Search Data Init/////////////////
    cout << "Before Search Data Init" << endl;
    startTime = chrono::steady_clock::now();
    char* data = new char[SEARCHITEMSIZE*STRINGSIZE](); // zero-filled
    for (int i = 0; i < SEARCHITEMSIZE; i++)
    {
        int dataIndex = i*STRINGSIZE;
        const int length = rand() % (STRINGSIZE-21) + 20; // 20..248 characters
        for (int k = 0; k < length; k++)
            data[dataIndex++] = 'a';
        data[dataIndex] = '\0';
    }
    endTime = chrono::steady_clock::now();
    millisecs_t duration(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "After Search Data Init: " << duration.count() << "ms" <<endl;

    //////////Search Keyword Init/////////////////
    cout << "Before Search Keyword Init" << endl;
    startTime = chrono::steady_clock::now();
    char* keyword = new char[SEARCHTERMSIZE*STRINGSIZE](); // zero-filled
    for (int i = 0; i < SEARCHTERMSIZE; i++)
    {
        int keywordIndex = i*STRINGSIZE;
        const int length = rand() % (STRINGSIZE - 21) + 20; // 20..248 characters
        for (int k = 0; k < length; k++)
            keyword[keywordIndex++] = 'a';
        keyword[keywordIndex] = '\0';
    }
    endTime = chrono::steady_clock::now();
    millisecs_t duration1(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "After Search Keyword Init: " << duration1.count() << "ms" << endl;

    bool* result = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
    bool* result2 = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
    bool* result3 = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
    bool* result4 = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
    bool* result5 = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
    char* d_data = 0;
    char* d_keyword = 0;
    char* d_keyword_T = 0;
    bool* d_result = 0;

    /////////////////////////CudaMalloc/////////////////////////////////
    cout << "Before Malloc" << endl;
    startTime = chrono::steady_clock::now();
    check(cudaMalloc(&d_data, dataBytes), "cudaMalloc(d_data)");
    check(cudaMalloc(&d_keyword, keywordBytes), "cudaMalloc(d_keyword)");
    check(cudaMalloc(&d_keyword_T, keywordBytes), "cudaMalloc(d_keyword_T)");
    check(cudaMalloc(&d_result, resultBytes), "cudaMalloc(d_result)");
    endTime = chrono::steady_clock::now();
    millisecs_t duration2(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "After Malloc: " << duration2.count() << "ms" << endl;

    // One pair of events is reused for every GPU-side measurement.
    cudaEvent_t start, stop;
    float elapsedTime = 0.0f;
    check(cudaEventCreate(&start), "cudaEventCreate(start)");
    check(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    auto startTimer = [&]()
    {
        check(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    };
    // Records the stop event, waits for it and stores the time in elapsedTime.
    // The synchronisation also surfaces errors raised while a kernel executed.
    auto stopTimer = [&]()
    {
        check(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
        check(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
        check(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
    };
    // Clears the shared output buffer so each kernel is verified on its own
    // output only. Issued before the start event, hence not part of the timing.
    auto clearResult = [&]()
    {
        check(cudaMemset(d_result, 0, resultBytes), "cudaMemset(d_result)");
    };

    /////////////////////////CudaMemCpy///////////////////////////////////
    cout << "Before Memcpy" << endl;
    startTimer();
    check(cudaMemcpy(d_data, data, dataBytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_data)");
    check(cudaMemcpy(d_keyword, keyword, keywordBytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_keyword)");
    // Transpose the keywords for kernel 5: character j of keyword i moves to
    // keyword_T[j*SEARCHTERMSIZE + i]. As before, this host-side work falls
    // inside the interval reported as "After Memcpy".
    char* keyword_T = new char[SEARCHTERMSIZE*STRINGSIZE];
    for (int i = 0; i < SEARCHTERMSIZE; i++)
        for (int j = 0; j < STRINGSIZE; j++)
            keyword_T[j*SEARCHTERMSIZE+i] = keyword[i*STRINGSIZE+j];
    check(cudaMemcpy(d_keyword_T, keyword_T, keywordBytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_keyword_T)");
    stopTimer();
    cout << "After Memcpy: " << elapsedTime << "ms" << endl;

    ////////////////////////Kernel//////////////////////////////////////////
    clearResult();
    cout << "Before Kernel" << endl;
    startTimer();
    searchKeywordKernel <<<(SEARCHTERMSIZE/32)+1, 32 >>>(d_result, d_data, d_keyword);
    check(cudaGetLastError(), "searchKeywordKernel launch");
    stopTimer();
    cout << "After Kernel: " << elapsedTime << "ms" << endl;
    check(cudaMemcpy(result, d_result, resultBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(result)");

    ////////////////////////Kernel2//////////////////////////////////////////
    clearResult();
    cout << "Before Kernel2" << endl;
    startTimer();
    searchKeywordKernel2 <<<(SEARCHITEMSIZE/1024)+1, 1024 >>>(d_result, d_data, d_keyword);
    check(cudaGetLastError(), "searchKeywordKernel2 launch");
    stopTimer();
    cout << "After Kernel2: " << elapsedTime << "ms" << endl;
    check(cudaMemcpy(result2, d_result, resultBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(result2)");

    ////////////////////////Kernel3//////////////////////////////////////////
    clearResult();
    cout << "Before Kernel3" << endl;
    startTimer();
    searchKeywordKernel3 <<<1, 1 >>>(d_result, d_data, d_keyword);
    check(cudaGetLastError(), "searchKeywordKernel3 launch");
    stopTimer();
    cout << "After Kernel3: " << elapsedTime << "ms" << endl;
    check(cudaMemcpy(result3, d_result, resultBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(result3)");

    ////////////////////////Kernel4//////////////////////////////////////////
    clearResult();
    cout << "Before Kernel4" << endl;
    startTimer();
    searchKeywordKernel4 <<<((SEARCHITEMSIZE*SEARCHTERMSIZE)/1024)+1, 1024 >>>(d_result, d_data, d_keyword);
    check(cudaGetLastError(), "searchKeywordKernel4 launch");
    stopTimer();
    cout << "After Kernel4: " << elapsedTime << "ms" << endl;
    check(cudaMemcpy(result4, d_result, resultBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(result4)");

    ////////////////////////Kernel5//////////////////////////////////////////
    check(cudaFuncSetCacheConfig(searchKeywordKernel5, cudaFuncCachePreferL1), "cudaFuncSetCacheConfig");
    clearResult();
    cout << "Before Kernel5" << endl;
    startTimer();
    // One block per data string, one thread per (transposed) keyword.
    searchKeywordKernel5 <<<SEARCHITEMSIZE, SEARCHTERMSIZE >>>(d_result, d_data, d_keyword_T);
    check(cudaGetLastError(), "searchKeywordKernel5 launch");
    stopTimer();
    cout << "After Kernel5: " << elapsedTime << "ms" << endl;
    check(cudaMemcpy(result5, d_result, resultBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(result5)");

    /////////////////////////////////// CPU code //////////////////////////////////////////
    bool* cpuResult = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
    int nonParallelResultIndex = 0;
    cout << "CPU code starts" << endl;
    startTime = chrono::steady_clock::now();
    for (int i = 0; i < SEARCHTERMSIZE; i++)
    {
        for (int j = 0; j < SEARCHITEMSIZE; j++)
        {
            int keywordCharIndex = i*STRINGSIZE;
            int dataCharIndex = j*STRINGSIZE;
            cpuResult[nonParallelResultIndex] = true;
            while (keyword[keywordCharIndex] != '\0')
            {
                if ((keyword[keywordCharIndex] != data[dataCharIndex]) || (data[dataCharIndex] == '\0'))
                {
                    cpuResult[nonParallelResultIndex] = false;
                    break;
                }
                keywordCharIndex++;
                dataCharIndex++;
            }
            nonParallelResultIndex++;
        }
    }
    endTime = chrono::steady_clock::now();
    millisecs_t duration3(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "CPU code ends: " << duration3.count() << "ms" << endl;

    ////////////////////////////////////Result Comparison////////////////////////////////////////
    bool kernel1Res = true;
    bool kernel2Res = true;
    bool kernel3Res = true;
    bool kernel4Res = true;
    bool kernel5Res = true;
    for (int i = 0; i < SEARCHITEMSIZE*SEARCHTERMSIZE; i++)
    {
        if (cpuResult[i] != result[i])
            kernel1Res = false;
        if (cpuResult[i] != result2[i])
            kernel2Res = false;
        if (cpuResult[i] != result3[i])
            kernel3Res = false;
        if (cpuResult[i] != result4[i])
            kernel4Res = false;
        if (cpuResult[i] != result5[i])
            kernel5Res = false;
        // Nothing more to learn once every kernel has been found to differ.
        if (!kernel1Res && !kernel2Res && !kernel3Res && !kernel4Res && !kernel5Res)
            break;
    }
    cout << boolalpha << "Kernel1 computation: " << kernel1Res << endl;
    cout << boolalpha << "Kernel2 computation: " << kernel2Res << endl;
    cout << boolalpha << "Kernel3 computation: " << kernel3Res << endl;
    cout << boolalpha << "Kernel4 computation: " << kernel4Res << endl;
    cout << boolalpha << "Kernel5 computation: " << kernel5Res << endl;

    cout << "Before Deleting arrays" << endl;
    delete[] data;
    delete[] keyword;
    delete[] keyword_T;
    delete[] result;
    delete[] result2;
    delete[] result3;
    delete[] result4;
    delete[] result5;
    delete[] cpuResult;
    cout << "After Deleting arrays" << endl;

    cout << "Before Freeing device memory" << endl;
    check(cudaFree(d_data), "cudaFree(d_data)");
    check(cudaFree(d_keyword), "cudaFree(d_keyword)");
    check(cudaFree(d_keyword_T), "cudaFree(d_keyword_T)");
    check(cudaFree(d_result), "cudaFree(d_result)");
    cout << "After Freeing device memory" << endl;

    check(cudaEventDestroy(start), "cudaEventDestroy(start)");
    check(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    cudaDeviceReset();
    return 0;
}
$ nvcc -O3 -std=c++11 -o t703 t703.cu
$ ./t703
Before Search Data Init
After Search Data Init: 0ms
Before Search Keyword Init
After Search Keyword Init: 0ms
Before Malloc
After Malloc: 38ms
Before Memcpy
After Memcpy: 1.09805ms
Before Kernel
After Kernel: 1455.98ms
Before Kernel2
After Kernel2: 110.16ms
Before Kernel3
After Kernel3: 363.236ms
Before Kernel4
After Kernel4: 96.9751ms
Before Kernel5
After Kernel5: 10.9064ms
CPU code starts
CPU code ends: 76ms
Kernel1 computation: true
Kernel2 computation: true
Kernel3 computation: true
Kernel4 computation: true
Kernel5 computation: true
Before Deleting arrays
After Deleting arrays
Before Freeing device memory
After Freeing device memory
$
一些注意事项:
使用 `-G`(调试)开关编译CUDA代码(Visual Studio在CUDA项目的调试配置下就是这样做的)可能会对代码性能产生重大影响。无论何时对CUDA代码进行基准测试或性能分析,都不应使用 `-G` 开关。