Question

我刚刚开始学习cuda，我想知道CUDA代码与cpu代码在简单字符串搜索上的表现。

搜索方法：如果数据字符串的前x个字符与之完全相同关键字，然后返回true。（x =＆gt;关键字的大小）

有100个关键字和10000个数据字符串。我在这里想要完成的是以并发方式进行比较并比较经过的时间。我写了4个不同的内核和一个cpu代码。然而，我得到的结果相当令人困惑。

searchKeywordKernel：创建4 * 32个线程。每个线程获取一个关键字并将其与10000个数据字符串进行比较，然后将结果写入bool数组。耗时2650ms。

searchKeywordKernel2：创建10 * 1024个线程。每个线程获取一个数据字符串并将其与100个关键字进行比较，然后将结果写入bool数组。花了1397ms。

searchKeywordKernel3：创建1 * 1线程。它的行为类似于cpu代码，生成结果需要279ms。

searchKeywordKernel4：创建977 * 1024个线程。每个线程进行一次字符串比较，花费1334ms。

CPU：进行1000000次字符串比较。花了265毫秒。

我想问几个问题：

为什么searchKeywordKernel3在与cpu代码相似的时间内生成结果？我仔细检查了代码，但没有发现任何问题。

为什么cpu代码与不包括searchKeywordKernel3的内核相比效果更好？是否因为读取操作或数据大小？

硬件信息：显卡：NVidia GT730，处理器：Intel i5-4460。

用于生成结果的代码是：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <chrono>

#define SEARCHTERMSIZE 100
#define SEARCHITEMSIZE 10000
#define STRINGSIZE 250

using namespace std;

__global__ void searchKeywordKernel(bool* result, char* data, char* keyword)
{
    int keywordStringIndex = threadIdx.x + blockIdx.x * blockDim.x;
    int dataStringIndex = 0;
    int keywordCharIndex = 0;
    int dataCharIndex = 0;  
    int resultIndex = 0;

    if (keywordStringIndex < SEARCHTERMSIZE)
    {
        for (; dataStringIndex < SEARCHITEMSIZE; dataStringIndex++)
        {
            dataCharIndex = dataStringIndex*STRINGSIZE;
            keywordCharIndex = keywordStringIndex*STRINGSIZE;
            resultIndex = keywordStringIndex*SEARCHITEMSIZE + dataStringIndex;
            result[resultIndex] = true;
            while (keyword[keywordCharIndex] != '\0')
            {
                if ((keyword[keywordCharIndex] != data[dataCharIndex]) || (data[dataCharIndex] == '\0'))
                {
                    result[resultIndex] = false;
                    break;
                }
                keywordCharIndex++;
                dataCharIndex++;
            }
        }
    }   
}
__global__ void searchKeywordKernel2(bool* result, char* data, char* keyword)
{
    int keywordStringIndex = 0;
    int dataStringIndex = threadIdx.x + blockIdx.x * blockDim.x;
    int keywordCharIndex = 0;
    int dataCharIndex = 0;
    int resultIndex = 0;

    if (dataStringIndex < SEARCHITEMSIZE)
    {
        for (; keywordStringIndex < SEARCHTERMSIZE; keywordStringIndex++)
        {
            dataCharIndex = dataStringIndex*STRINGSIZE;
            keywordCharIndex = keywordStringIndex*STRINGSIZE;
            resultIndex = keywordStringIndex*SEARCHITEMSIZE + dataStringIndex;
            result[resultIndex] = true;
            while (keyword[keywordCharIndex] != '\0')
            {
                if ((keyword[keywordCharIndex] != data[dataCharIndex]) || (data[dataCharIndex] == '\0'))
                {
                    result[resultIndex] = false;
                    break;
                }
                keywordCharIndex++;
                dataCharIndex++;
            }
        }
    }
}
__global__ void searchKeywordKernel3(bool* result, char* data, char* keyword)
{
    int keywordStringIndex = 0;
    int dataStringIndex = 0;
    int keywordCharIndex = 0;
    int dataCharIndex = 0;
    int resultIndex = 0;

    if (threadIdx.x + blockIdx.x * blockDim.x < 1)
    {
        for (; keywordStringIndex < SEARCHTERMSIZE; keywordStringIndex++)
        {
            for (; dataStringIndex < SEARCHITEMSIZE; dataStringIndex++)
            {
                dataCharIndex = dataStringIndex*STRINGSIZE;
                keywordCharIndex = keywordStringIndex*STRINGSIZE;
                result[resultIndex] = true;
                while (keyword[keywordCharIndex] != '\0')
                {
                    if ((keyword[keywordCharIndex] != data[dataCharIndex]) || (data[dataCharIndex] == '\0'))
                    {
                        result[resultIndex] = false;
                        break;
                    }
                    keywordCharIndex++;
                    dataCharIndex++;
                }
                resultIndex++;
            }
        }
    }
}
__global__ void searchKeywordKernel4(bool* result, char* data, char* keyword)
{
    int id = threadIdx.x + blockIdx.x * blockDim.x;
    if (id < SEARCHTERMSIZE*SEARCHITEMSIZE)
    {
        int keywordStringIndex = id / SEARCHITEMSIZE;
        int dataStringIndex = id%SEARCHITEMSIZE;
        int keywordCharIndex;
        int dataCharIndex;
        int resultIndex;

        dataCharIndex = dataStringIndex*STRINGSIZE;
        keywordCharIndex = keywordStringIndex*STRINGSIZE;
        resultIndex = keywordStringIndex*SEARCHITEMSIZE + dataStringIndex;
        result[resultIndex] = true;
        while (keyword[keywordCharIndex] != '\0')
        {
            if ((keyword[keywordCharIndex] != data[dataCharIndex]) || (data[dataCharIndex] == '\0'))
            {
                result[resultIndex] = false;
                break;
            }
            keywordCharIndex++;
            dataCharIndex++;
        }       
    }
}

int main()
{
    chrono::steady_clock::time_point startTime;
    chrono::steady_clock::time_point endTime;
    typedef chrono::duration<int, milli> millisecs_t;

    //////////Search Data Init/////////////////
    cout << "Before Search Data Init" << endl;
    startTime = chrono::steady_clock::now();
    char* data = new char[SEARCHITEMSIZE*STRINGSIZE];
    int temp = 0;
    int dataIndex = 0;
    for (int i = 0; i < SEARCHITEMSIZE; i++)
    {
        dataIndex = i*STRINGSIZE;
        temp = rand() % (STRINGSIZE-21) + 20;
        for (int k = 0; k < temp; k++)
        {           
            data[dataIndex] = 'a';
            dataIndex++;
        }
        data[dataIndex] = '\0';
    }           
    endTime = chrono::steady_clock::now();
    millisecs_t duration(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "After Search Data Init: " << duration.count() << "ms" <<endl;
    //////////Search Data Init/////////////////

    //////////Search Keyword Init/////////////////
    cout << "Before Search Keyword Init" << endl;
    startTime = chrono::steady_clock::now();
    char* keyword = new char[SEARCHTERMSIZE*STRINGSIZE];
    int keywordIndex = 0;
    for (int i = 0; i < SEARCHTERMSIZE; i++)
    {
        keywordIndex = i*STRINGSIZE;
        temp = rand() % (STRINGSIZE - 21) + 20;
        for (int k = 0; k < temp; k++)
        {
            keyword[keywordIndex] = 'a';
            keywordIndex++;
        }
        keyword[keywordIndex] = '\0';
        keywordIndex++;
    }   
    endTime = chrono::steady_clock::now();
    millisecs_t duration1(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "After Search Keyword Init: " << duration1.count()  << "ms" << endl;
    //////////Search Keyword Init/////////////////  

    bool* result = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
    bool* result2 = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
    bool* result3 = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
    bool* result4 = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];

    char* d_data;
    char* d_keyword;
    bool* d_result;

    /////////////////////////CudaMalloc/////////////////////////////////
    cout << "Before Malloc" << endl;
    startTime = chrono::steady_clock::now();

    cudaMalloc(&d_data, sizeof(char) * SEARCHITEMSIZE * STRINGSIZE);
    cudaMalloc(&d_keyword, sizeof(char) * SEARCHTERMSIZE * STRINGSIZE);
    cudaMalloc(&d_result, sizeof(bool)*SEARCHITEMSIZE * SEARCHTERMSIZE);

    endTime = chrono::steady_clock::now();
    millisecs_t duration2(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "After Malloc: " << duration2.count() << "ms" << endl;
    /////////////////////////CudaMalloc/////////////////////////////////

    cudaEvent_t start, stop;
    float elapsedTime;

    /////////////////////////CudaMemCpy///////////////////////////////////
    cout << "Before Memcpy" << endl;
    cudaEventCreate(&start);
    cudaEventRecord(start, 0);

    cudaMemcpy(d_data, data, sizeof(char) * SEARCHITEMSIZE * STRINGSIZE, cudaMemcpyHostToDevice);
    cudaMemcpy(d_keyword, keyword, sizeof(char) * SEARCHTERMSIZE * STRINGSIZE, cudaMemcpyHostToDevice);

    cudaEventCreate(&stop);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cout << "After Memcpy: " << elapsedTime << "ms" << endl;
    /////////////////////////CudaMemCpy///////////////////////////////////



    ////////////////////////Kernel//////////////////////////////////////////
    cout << "Before Kernel" << endl;
    cudaEventCreate(&start);
    cudaEventRecord(start, 0);

    searchKeywordKernel <<<(SEARCHTERMSIZE/32)+1, 32 >>>(d_result, d_data, d_keyword);

    cudaEventCreate(&stop);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cout << "After Kernel: " << elapsedTime << "ms" << endl;
    ////////////////////////Kernel//////////////////////////////////////////

    cudaMemcpy(result, d_result, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);

    ////////////////////////Kernel2//////////////////////////////////////////
    cout << "Before Kernel2" << endl;
    cudaEventCreate(&start);
    cudaEventRecord(start, 0);

    searchKeywordKernel2 << < (SEARCHITEMSIZE/1024) +1 , 1024 >> >(d_result, d_data, d_keyword);

    cudaEventCreate(&stop);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cout << "After Kernel2: " << elapsedTime << "ms" << endl;
    ////////////////////////Kernel2//////////////////////////////////////////

    cudaMemcpy(result2, d_result, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);

    ////////////////////////Kernel3//////////////////////////////////////////
    cout << "Before Kernel3" << endl;
    cudaEventCreate(&start);
    cudaEventRecord(start, 0);

    searchKeywordKernel3 << <1, 1 >> >(d_result, d_data, d_keyword);

    cudaEventCreate(&stop);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cout << "After Kernel3: " << elapsedTime << "ms" << endl;
    ////////////////////////Kernel3//////////////////////////////////////////

    cudaMemcpy(result3, d_result, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);

    ////////////////////////Kernel4//////////////////////////////////////////
    cout << "Before Kernel4" << endl;
    cudaEventCreate(&start);
    cudaEventRecord(start, 0);

    searchKeywordKernel4 << <((SEARCHITEMSIZE*SEARCHTERMSIZE)/1024)+1, 1024 >> >(d_result, d_data, d_keyword);

    cudaEventCreate(&stop);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cout << "After Kernel4: " << elapsedTime << "ms" << endl;
    ////////////////////////Kernel4//////////////////////////////////////////

    cudaMemcpy(result4, d_result, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);

    /*
    for (int i = 0; i < 10; i++)
    {
        for (int j = 0; j < 10; j++)
            cout << boolalpha << i << " vs " << j << ": " << result4[i*SEARCHITEMSIZE + j] << endl;
        cout << "*****************************************" << endl;
    }
    */
    /////////////////////////////////// CPU code //////////////////////////////////////////

    bool* cpuResult = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];

    int dataCharIndex = 0;
    int keywordCharIndex = 0;
    int nonParallelResultIndex = 0;

    cout << "CPU code starts" << endl;
    startTime = chrono::steady_clock::now();
    for (int i = 0; i < SEARCHTERMSIZE;i++)
    {   
        for (int j = 0; j < SEARCHITEMSIZE; j++)
        {
            keywordCharIndex = i*STRINGSIZE;
            dataCharIndex = j*STRINGSIZE;
            cpuResult[nonParallelResultIndex] = true;
            while (keyword[keywordCharIndex] != '\0')
            {
                if ((keyword[keywordCharIndex] != data[dataCharIndex]) || (data[dataCharIndex] == '\0'))
                {
                    cpuResult[nonParallelResultIndex] = false;
                    break;
                }
                keywordCharIndex++;
                dataCharIndex++;
            }
            nonParallelResultIndex++;
        }
    }
    endTime = chrono::steady_clock::now();
    millisecs_t duration3(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "CPU code ends: " << duration3.count() << "ms" << endl;
    /////////////////////////////////// CPU code //////////////////////////////////////////
    /*
    for (int i = 0; i < 10; i++)
    {
        for (int j = 0; j < 10; j++)
            cout << boolalpha << i << " vs " << j << ": " << nonParallelResult[i*SEARCHITEMSIZE+j] << endl;
        cout << "*****************************************" << endl;
    }   
    */
    ////////////////////////////////////Result Comparison////////////////////////////////////////
    bool kernel1Res, kernel2Res, kernel3Res, kernel4Res;

    kernel1Res = true;
    kernel2Res = true;
    kernel3Res = true;
    kernel4Res = true;

    for (int i = 0; i < SEARCHITEMSIZE*SEARCHTERMSIZE; i++)
    {
        if (cpuResult[i] != result[i] && kernel1Res)
            kernel1Res = false;
        if (cpuResult[i] != result2[i] && kernel2Res)
            kernel2Res = false;
        if (cpuResult[i] != result3[i] && kernel3Res)
            kernel3Res = false;
        if (cpuResult[i] != result4[i] && kernel4Res)
            kernel4Res = false;
        if (!kernel1Res && !kernel2Res && !kernel3Res && !kernel4Res)
            break;      
    }
    ////////////////////////////////////Result Comparison////////////////////////////////////////

    cout << boolalpha << "Kernel1 computation: " << kernel1Res << endl;
    cout << boolalpha << "Kernel2 computation: " << kernel2Res << endl;
    cout << boolalpha << "Kernel3 computation: " << kernel3Res << endl;
    cout << boolalpha << "Kernel4 computation: " << kernel4Res << endl;

    cout << "Before Deleting arrays" << endl;
    delete[] data;
    delete[] keyword;
    delete[] result;
    delete[] result2;
    delete[] result3;
    delete[] result4;
    delete[] cpuResult;
    cout << "After Deleting arrays" << endl;

    cout << "Before Freeing device memory" << endl;
    cudaFree(d_data);
    cudaFree(d_keyword);    
    cudaFree(d_result);
    cout << "After Freeing device memory" << endl;

    cudaDeviceReset();
    system("pause");
    return 0;
}

提前谢谢。

Answer 1

您的代码似乎功能正确 - 这是程序员的第一份工作。那么如何让它运行得更快呢？

CUDA程序员应该注意性能的前两个概念是：

您需要批次的主题。通常我们需要10,000个或更多线程，并且通常没有任何重大损失，因为拥有更多线程。对机器体系结构产生了大量线程的需求 - 它是一个隐藏延迟的机器，它通过拥有可以即时切换的大量工作来隐藏延迟。＆＃34;工作＆＃34;在这种情况下，可以松散地翻译为＆＃34;线程＆＃34;。
您希望有效利用内存系统。这可能涉及许多不同的想法，但我们要关注的第一个是合并访问全局内存。（您并未在任何内核中使用任何共享内存，但如果您使用，我们也希望对共享内存进行非银行冲突访问）。我们还希望在数据使用方面具有效率，最后，与任何计算机优化一样，我们希望利用内存层次结构来查找数据重用机会，并将这些数据项移动到更高的＆＃34;更高的＆＃34;内存层次结构中的级别。

那么这对您的代码意味着什么？如果你想写一个＆＃34;快速＆＃34;内核，你需要很多线程，并且还要针对100％合并的全局内存负载。因此，内核1和3中的策略看起来不是很好 - 它们根本就没有启动足够的线程。 2更好，但内核4中的策略可能更好 - 它允许我们启动100 * 10000个线程。这符合我们对＆＃34; lot＆＃34;的定义。因此，让我们继续使用线程策略，该策略说每个线程将负责生成result数组的一个元素（因为有100 * 10000个结果）。

现在，关于合并访问，这归结为数据组织。相邻线程如何访问数据？它是连续的吗？在你的kernel4的情况下，它不是。相邻的线程正在从data读取，其间隙非常大，因为您遍历正在执行工作的while循环。

要解决此问题，我们可以转置我们的数据。我选择使用数据重用优化：

指定每个线程块以处理data
分配线程块中的每个线程，以处理与步骤1中result项关联的data个元素。
由于每个threadblock只处理data的一个元素（字符串），我们可以将该元素（字符串）移动到共享内存中，这样我们每个线程块只读取一次，然后每个线程检索所需的值超出共享内存。这意味着data中的每个字符串只能从<全局内存中读取一次，这是最佳的。
由于步骤3中的优化选择，我们可以避免转换data以实现最佳合并负载。但是我们仍然需要在keyword中转换字符串，因为每个线程都将通过全局加载来读取它。我们在这里受益于整个keyword阵列更小 - 大约25K字节，它可以适合GPU L1缓存（如果可用）或当然适合L2。

根据我的测试，通过上述策略和选择，我能够制作出比CPU代码快5倍的内核。由于这个内核很可能在很大程度上限制了带宽，因此我们可能在性能方面处于领先地位。这是一个功能齐全的示例，将您的代码添加到第5个内核中，该内核源自您的第4个内核，但使用keyword数组的转置形式：

$ cat t703.cu
#include <stdio.h>
#include <iostream>
#include <chrono>

#define SEARCHTERMSIZE 100
#define SEARCHITEMSIZE 10000
#define STRINGSIZE 250

using namespace std;

__global__ void searchKeywordKernel(bool* result, char* data, char* keyword)
{
    int keywordStringIndex = threadIdx.x + blockIdx.x * blockDim.x;
    int dataStringIndex = 0;
    int keywordCharIndex = 0;
    int dataCharIndex = 0;  
    int resultIndex = 0;

    if (keywordStringIndex < SEARCHTERMSIZE)
    {
        for (; dataStringIndex < SEARCHITEMSIZE; dataStringIndex++)
        {
            dataCharIndex = dataStringIndex*STRINGSIZE;
            keywordCharIndex = keywordStringIndex*STRINGSIZE;
            resultIndex = keywordStringIndex*SEARCHITEMSIZE + dataStringIndex;
            result[resultIndex] = true;
            while (keyword[keywordCharIndex] != '\0')
            {
                if ((keyword[keywordCharIndex] != data[dataCharIndex]) || (data[dataCharIndex] == '\0'))
                {
                    result[resultIndex] = false;
                    break;
                }
                keywordCharIndex++;
                dataCharIndex++;
            }
        }
    }   
}
__global__ void searchKeywordKernel2(bool* result, char* data, char* keyword)
{
    int keywordStringIndex = 0;
    int dataStringIndex = threadIdx.x + blockIdx.x * blockDim.x;
    int keywordCharIndex = 0;
    int dataCharIndex = 0;
    int resultIndex = 0;

    if (dataStringIndex < SEARCHITEMSIZE)
    {
        for (; keywordStringIndex < SEARCHTERMSIZE; keywordStringIndex++)
        {
            dataCharIndex = dataStringIndex*STRINGSIZE;
            keywordCharIndex = keywordStringIndex*STRINGSIZE;
            resultIndex = keywordStringIndex*SEARCHITEMSIZE + dataStringIndex;
            result[resultIndex] = true;
            while (keyword[keywordCharIndex] != '\0')
            {
                if ((keyword[keywordCharIndex] != data[dataCharIndex]) || (data[dataCharIndex] == '\0'))
                {
                    result[resultIndex] = false;
                    break;
                }
                keywordCharIndex++;
                dataCharIndex++;
            }
        }
    }
}
__global__ void searchKeywordKernel3(bool* result, char* data, char* keyword)
{
    int keywordStringIndex = 0;
    int dataStringIndex = 0;
    int keywordCharIndex = 0;
    int dataCharIndex = 0;
    int resultIndex = 0;

    if (threadIdx.x + blockIdx.x * blockDim.x < 1)
    {
        for (; keywordStringIndex < SEARCHTERMSIZE; keywordStringIndex++)
        {
            for (; dataStringIndex < SEARCHITEMSIZE; dataStringIndex++)
            {
                dataCharIndex = dataStringIndex*STRINGSIZE;
                keywordCharIndex = keywordStringIndex*STRINGSIZE;
                result[resultIndex] = true;
                while (keyword[keywordCharIndex] != '\0')
                {
                    if ((keyword[keywordCharIndex] != data[dataCharIndex]) || (data[dataCharIndex] == '\0'))
                    {
                        result[resultIndex] = false;
                        break;
                    }
                    keywordCharIndex++;
                    dataCharIndex++;
                }
                resultIndex++;
            }
        }
    }
}
__global__ void searchKeywordKernel4(bool* result, char* data, char* keyword)
{
    int id = threadIdx.x + blockIdx.x * blockDim.x;
    if (id < SEARCHTERMSIZE*SEARCHITEMSIZE)
    {
        int keywordStringIndex = id / SEARCHITEMSIZE;
        int dataStringIndex = id%SEARCHITEMSIZE;
        int keywordCharIndex;
        int dataCharIndex;
        int resultIndex;

        dataCharIndex = dataStringIndex*STRINGSIZE;
        keywordCharIndex = keywordStringIndex*STRINGSIZE;
        resultIndex = keywordStringIndex*SEARCHITEMSIZE + dataStringIndex;
        result[resultIndex] = true;
        while (keyword[keywordCharIndex] != '\0')
        {
            if ((keyword[keywordCharIndex] != data[dataCharIndex]) || (data[dataCharIndex] == '\0'))
            {
                result[resultIndex] = false;
                break;
            }
            keywordCharIndex++;
            dataCharIndex++;
        }       
    }
}

// this kernel is a modification of kernel 4, and assumes that the keyword array is transposed
// and that the kernel will be launched with one block per data string, and one thread per keyword

__global__ void searchKeywordKernel5(bool* result, const char  * __restrict__ data,  const char * keyword)
{
    int bid = blockIdx.x;
    int tid = threadIdx.x;
    __shared__ char sdata[STRINGSIZE];
    if (bid < SEARCHITEMSIZE)
    {
        int my_tid = tid;
        while (my_tid < STRINGSIZE){  //load data string to be used by this block into shared mem
          sdata[my_tid] = data[bid*STRINGSIZE + my_tid]; //coalesced global load
          my_tid += blockDim.x;}
        __syncthreads();
        if (tid < SEARCHTERMSIZE){
          int resultIndex = tid*SEARCHITEMSIZE + bid;
          result[resultIndex] = true; //uncoalesced store - could be improved by reorganizing result
          char test = keyword[tid]; // coalesced global load
          int i = 0;
          while (test != '\0')
          {
            char temp = sdata[i]; // shared memory broadcast
            if ((test != temp) || (temp == '\0'))
            {
                result[resultIndex] = false; //uncoalesced store
                break;
            }
            i++;
            test = keyword[i*SEARCHTERMSIZE+tid]; //coalesced global load
          }
        }        
    }
}


int main()
{
    chrono::steady_clock::time_point startTime;
    chrono::steady_clock::time_point endTime;
    typedef chrono::duration<int, milli> millisecs_t;

    //////////Search Data Init/////////////////
    cout << "Before Search Data Init" << endl;
    startTime = chrono::steady_clock::now();
    char* data = new char[SEARCHITEMSIZE*STRINGSIZE];
    int temp = 0;
    int dataIndex = 0;
    for (int i = 0; i < SEARCHITEMSIZE; i++)
    {
        dataIndex = i*STRINGSIZE;
        temp = rand() % (STRINGSIZE-21) + 20;
        for (int k = 0; k < temp; k++)
        {           
            data[dataIndex] = 'a';
            dataIndex++;
        }
        data[dataIndex] = '\0';
    }           
    endTime = chrono::steady_clock::now();
    millisecs_t duration(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "After Search Data Init: " << duration.count() << "ms" <<endl;
    //////////Search Data Init/////////////////

    //////////Search Keyword Init/////////////////
    cout << "Before Search Keyword Init" << endl;
    startTime = chrono::steady_clock::now();
    char* keyword = new char[SEARCHTERMSIZE*STRINGSIZE];
    int keywordIndex = 0;
    for (int i = 0; i < SEARCHTERMSIZE; i++)
    {
        keywordIndex = i*STRINGSIZE;
        temp = rand() % (STRINGSIZE - 21) + 20;
        for (int k = 0; k < temp; k++)
        {
            keyword[keywordIndex] = 'a';
            keywordIndex++;
        }
        keyword[keywordIndex] = '\0';
        keywordIndex++;
    }   
    endTime = chrono::steady_clock::now();
    millisecs_t duration1(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "After Search Keyword Init: " << duration1.count()  << "ms" << endl;
    //////////Search Keyword Init/////////////////  

    bool* result  = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
    bool* result2 = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
    bool* result3 = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
    bool* result4 = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
    bool* result5 = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];

    char* d_data;
    char* d_keyword;
    char* d_keyword_T;
    bool* d_result;

    /////////////////////////CudaMalloc/////////////////////////////////
    cout << "Before Malloc" << endl;
    startTime = chrono::steady_clock::now();

    cudaMalloc(&d_data, sizeof(char) * SEARCHITEMSIZE * STRINGSIZE);
    cudaMalloc(&d_keyword, sizeof(char) * SEARCHTERMSIZE * STRINGSIZE);
    cudaMalloc(&d_keyword_T, sizeof(char) * SEARCHTERMSIZE * STRINGSIZE);
    cudaMalloc(&d_result, sizeof(bool)*SEARCHITEMSIZE * SEARCHTERMSIZE);

    endTime = chrono::steady_clock::now();
    millisecs_t duration2(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "After Malloc: " << duration2.count() << "ms" << endl;
    /////////////////////////CudaMalloc/////////////////////////////////

    cudaEvent_t start, stop;
    float elapsedTime;

    /////////////////////////CudaMemCpy///////////////////////////////////
    cout << "Before Memcpy" << endl;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    cudaMemcpy(d_data, data, sizeof(char) * SEARCHITEMSIZE * STRINGSIZE, cudaMemcpyHostToDevice);
    cudaMemcpy(d_keyword, keyword, sizeof(char) * SEARCHTERMSIZE * STRINGSIZE, cudaMemcpyHostToDevice);

    //transpose keywords
    char* keyword_T = new char[SEARCHTERMSIZE*STRINGSIZE];
    for (int i = 0; i < SEARCHTERMSIZE; i++)
      for (int j = 0; j < STRINGSIZE; j++)
        keyword_T[j*SEARCHTERMSIZE+i] = keyword[i*STRINGSIZE+j];

    cudaMemcpy(d_keyword_T, keyword_T, sizeof(char) * SEARCHTERMSIZE * STRINGSIZE, cudaMemcpyHostToDevice);


    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cout << "After Memcpy: " << elapsedTime << "ms" << endl;
    /////////////////////////CudaMemCpy///////////////////////////////////



    ////////////////////////Kernel//////////////////////////////////////////
    cout << "Before Kernel" << endl;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    searchKeywordKernel <<<(SEARCHTERMSIZE/32)+1, 32 >>>(d_result, d_data, d_keyword);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cout << "After Kernel: " << elapsedTime << "ms" << endl;
    ////////////////////////Kernel//////////////////////////////////////////

    cudaMemcpy(result, d_result, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);

    ////////////////////////Kernel2//////////////////////////////////////////
    cout << "Before Kernel2" << endl;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    searchKeywordKernel2 << < (SEARCHITEMSIZE/1024) +1 , 1024 >> >(d_result, d_data, d_keyword);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cout << "After Kernel2: " << elapsedTime << "ms" << endl;
    ////////////////////////Kernel2//////////////////////////////////////////

    cudaMemcpy(result2, d_result, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);

    ////////////////////////Kernel3//////////////////////////////////////////
    cout << "Before Kernel3" << endl;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    searchKeywordKernel3 << <1, 1 >> >(d_result, d_data, d_keyword);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cout << "After Kernel3: " << elapsedTime << "ms" << endl;
    ////////////////////////Kernel3//////////////////////////////////////////

    cudaMemcpy(result3, d_result, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);

    ////////////////////////Kernel4//////////////////////////////////////////
    cout << "Before Kernel4" << endl;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    searchKeywordKernel4 << <((SEARCHITEMSIZE*SEARCHTERMSIZE)/1024)+1, 1024 >> >(d_result, d_data, d_keyword);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cout << "After Kernel4: " << elapsedTime << "ms" << endl;
    ////////////////////////Kernel4//////////////////////////////////////////

    cudaMemcpy(result4, d_result, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);


    cudaFuncSetCacheConfig(searchKeywordKernel5, cudaFuncCachePreferL1);

    ////////////////////////Kernel5//////////////////////////////////////////
    cout << "Before Kernel5" << endl;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    searchKeywordKernel5 << <SEARCHITEMSIZE, SEARCHTERMSIZE >> >(d_result, d_data, d_keyword_T);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cout << "After Kernel5: " << elapsedTime << "ms" << endl;
    ////////////////////////Kernel5//////////////////////////////////////////

    cudaMemcpy(result5, d_result, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);

    /*
    for (int i = 0; i < 10; i++)
    {
        for (int j = 0; j < 10; j++)
            cout << boolalpha << i << " vs " << j << ": " << result4[i*SEARCHITEMSIZE + j] << endl;
        cout << "*****************************************" << endl;
    }
    */
    /////////////////////////////////// CPU code //////////////////////////////////////////

    bool* cpuResult = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];

    int dataCharIndex = 0;
    int keywordCharIndex = 0;
    int nonParallelResultIndex = 0;

    cout << "CPU code starts" << endl;
    startTime = chrono::steady_clock::now();
    for (int i = 0; i < SEARCHTERMSIZE;i++)
    {   
        for (int j = 0; j < SEARCHITEMSIZE; j++)
        {
            keywordCharIndex = i*STRINGSIZE;
            dataCharIndex = j*STRINGSIZE;
            cpuResult[nonParallelResultIndex] = true;
            while (keyword[keywordCharIndex] != '\0')
            {
                if ((keyword[keywordCharIndex] != data[dataCharIndex]) || (data[dataCharIndex] == '\0'))
                {
                    cpuResult[nonParallelResultIndex] = false;
                    break;
                }
                keywordCharIndex++;
                dataCharIndex++;
            }
            nonParallelResultIndex++;
        }
    }
    endTime = chrono::steady_clock::now();
    millisecs_t duration3(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "CPU code ends: " << duration3.count() << "ms" << endl;
    /////////////////////////////////// CPU code //////////////////////////////////////////
    /*
    for (int i = 0; i < 10; i++)
    {
        for (int j = 0; j < 10; j++)
            cout << boolalpha << i << " vs " << j << ": " << nonParallelResult[i*SEARCHITEMSIZE+j] << endl;
        cout << "*****************************************" << endl;
    }   
    */
    ////////////////////////////////////Result Comparison////////////////////////////////////////
    bool kernel1Res, kernel2Res, kernel3Res, kernel4Res, kernel5Res;

    kernel1Res = true;
    kernel2Res = true;
    kernel3Res = true;
    kernel4Res = true;
    kernel5Res = true;

    for (int i = 0; i < SEARCHITEMSIZE*SEARCHTERMSIZE; i++)
    {
        if (cpuResult[i] != result[i] && kernel1Res)
            kernel1Res = false;
        if (cpuResult[i] != result2[i] && kernel2Res)
            kernel2Res = false;
        if (cpuResult[i] != result3[i] && kernel3Res)
            kernel3Res = false;
        if (cpuResult[i] != result4[i] && kernel4Res)
            kernel4Res = false;
        if (cpuResult[i] != result5[i] && kernel5Res)
            kernel5Res = false;
        if (!kernel1Res && !kernel2Res && !kernel3Res && !kernel4Res && !kernel5Res)
            break;      
    }
    ////////////////////////////////////Result Comparison////////////////////////////////////////

    cout << boolalpha << "Kernel1 computation: " << kernel1Res << endl;
    cout << boolalpha << "Kernel2 computation: " << kernel2Res << endl;
    cout << boolalpha << "Kernel3 computation: " << kernel3Res << endl;
    cout << boolalpha << "Kernel4 computation: " << kernel4Res << endl;
    cout << boolalpha << "Kernel5 computation: " << kernel5Res << endl;

    cout << "Before Deleting arrays" << endl;
    delete[] data;
    delete[] keyword;
    delete[] result;
    delete[] result2;
    delete[] result3;
    delete[] result4;
    delete[] cpuResult;
    cout << "After Deleting arrays" << endl;

    cout << "Before Freeing device memory" << endl;
    cudaFree(d_data);
    cudaFree(d_keyword);    
    cudaFree(d_result);
    cout << "After Freeing device memory" << endl;

    cudaDeviceReset();
    return 0;
}

$ nvcc -O3 -std=c++11 -o t703 t703.cu
$ ./t703
Before Search Data Init
After Search Data Init: 0ms
Before Search Keyword Init
After Search Keyword Init: 0ms
Before Malloc
After Malloc: 38ms
Before Memcpy
After Memcpy: 1.09805ms
Before Kernel
After Kernel: 1455.98ms
Before Kernel2
After Kernel2: 110.16ms
Before Kernel3
After Kernel3: 363.236ms
Before Kernel4
After Kernel4: 96.9751ms
Before Kernel5
After Kernel5: 10.9064ms
CPU code starts
CPU code ends: 76ms
Kernel1 computation: true
Kernel2 computation: true
Kernel3 computation: true
Kernel4 computation: true
Kernel5 computation: true
Before Deleting arrays
After Deleting arrays
Before Freeing device memory
After Freeing device memory
$

一些注意事项：

您使用cuda事件有些不正确。你应该在时间区域之外创建你的cuda事件。此外，如果您打算重新创建事件，则应首先销毁它。您将在我的代码中看到这些更改。
以上结果来自Fedora20 linux系统，其中CUDA 7运行在四核Xeon处理器和Quadro5000 GPU上。您的系统上的数字会有所不同（尽管我希望我的内核仍然比您的CPU代码更快！）
要了解有关GPU代码优化的更多信息，GTC和GTC-Express提供了许多优秀的演示文稿，here is one of them。
正如您所发现的，使用-G（调试）开关（这是Visual Studio在调试CUDA项目上执行的操作）编译CUDA代码可能会对代码性能产生重大影响。无论何时对CUDA代码进行基准测试或分析以获得性能，都不应使用-G开关。

初学者帮助CUDA代码性能

1 个答案: