Question

我要编写一个cuda代码，它在数据字符串集中搜索关键字字符串集，并为关键字 - 数据字符串对返回一个布尔数组。

数据字符串：目前，10000 （可能会有所不同）字符串，每个字符串最多有250个字符。

关键字字符串：此时，100（可能会有所不同）字符串，每个字符串最多有100个字符。

每个字符串的长度都是已知的。

我的问题是，在这种情况下，以下哪种方法可能更合适。

1：
gridDim.x => 关键字字符串数量
gridDim.y => 数据字符串数量
blockDim => （最大字符串大小（在这种情况下为250），1, 1）
将使用朴素算法进行搜索。每个线程都会将关键字和数据的字符从全局内存加载到共享内存中。
每个线程将负责朴素搜索算法中的一个窗口。
结果将写入布尔数组。
因此，每个块将负责关键字 - 数据对。

第二：
gridDim => （数据字符串数量，1, 1）
blockDim => （关键字字符串数量，1, 1）
在每个块中，数据字符串将加载到共享mem。
在这种情况下，每个线程将负责关键字 - 数据对而不是块。
每个线程都会在数据字符串中搜索相应的关键字。在这种情况下不必使用朴素算法，可以使用Boyer-Moore。

对于大型文件中的搜索，由于数据的长度远大于关键字的长度，因此通常使用第一种方法。但在这种情况下，我不确定第一种方法是否更好。另一方面，对于第二种方法，关键字的合并访问（coalescing）可能是一个问题，因为关键字长度不固定。不过关键字的长度有一个上限，因此填充可以改善合并访问，但会消耗更多内存。

无论如何，如果您处理过类似的情况，或者知道比上述方法更好的方案，请帮帮我。先谢谢您。

所以，我已经实现了这两种情况。
方法1的代码：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "stdio.h"
#include "iostream"
#include "chrono"
#include "cstdlib"

#define SEARCHTERMSIZE 100
#define SEARCHITEMSIZE 65000
#define MAXDATASTRINGSIZE 250
#define MAXKEYWORDSTRINGSSIZE 50

using namespace std;

/*
 * Approach 1: one thread block per (data string, keyword) pair, naive search.
 *
 * Launch layout: gridDim.x indexes data strings (blockIdx.x), gridDim.y indexes
 * keywords (blockIdx.y). Any 1D block size works because staging and searching
 * use block-stride loops; the host code in this file launches
 * MAXDATASTRINGSIZE threads per block.
 *
 * Static shared memory: MAXDATASTRINGSIZE + MAXKEYWORDSTRINGSSIZE chars plus
 * one bool flag.
 *
 * Preconditions: dataLengths[i] <= MAXDATASTRINGSIZE and
 * keywordLengths[i] <= MAXKEYWORDSTRINGSSIZE; data strings are stored at a
 * fixed pitch of MAXDATASTRINGSIZE, keywords at a pitch of
 * MAXKEYWORDSTRINGSSIZE.
 *
 * Output: resultPtr[keywordIndex*SEARCHITEMSIZE + dataIndex] is true when the
 * keyword occurs in the data string, false otherwise.
 */
__global__ void searchKeywordKernel(bool* resultPtr, const char * dataPtr, const short*  dataLengths, const char *  keywordPtr, const short*  keywordLengths)
{
    int dataIndex = blockIdx.x;
    int keywordIndex = blockIdx.y;

    __shared__ char sData[MAXDATASTRINGSIZE];
    __shared__ char sKeyword[MAXKEYWORDSTRINGSSIZE];
    __shared__ bool isFound;

    // Block-uniform guard (depends only on blockIdx), so returning here cannot
    // strand other threads of the block at a barrier. It must come before the
    // length loads, which would otherwise index out of bounds.
    if (dataIndex >= SEARCHITEMSIZE || keywordIndex >= SEARCHTERMSIZE)
        return;

    int dataLength = dataLengths[dataIndex];
    int keywordLength = keywordLengths[keywordIndex];
    int resultIndex = keywordIndex*SEARCHITEMSIZE + dataIndex;

    // Also block-uniform: both lengths are the same for every thread.
    if (dataLength < keywordLength)
    {
        if (threadIdx.x == 0)
            resultPtr[resultIndex] = false;
        return;
    }

    if (threadIdx.x == 0)
        isFound = false;

    // Stage only the valid characters; the stride loop keeps the shared
    // arrays in bounds for any block size.
    for (int c = threadIdx.x; c < dataLength; c += blockDim.x)
        sData[c] = dataPtr[dataIndex*MAXDATASTRINGSIZE + c];
    for (int c = threadIdx.x; c < keywordLength; c += blockDim.x)
        sKeyword[c] = keywordPtr[keywordIndex*MAXKEYWORDSTRINGSSIZE + c];
    __syncthreads();

    // Each thread tests the windows starting at threadIdx.x, +blockDim.x, ...
    for (int w = threadIdx.x; w <= dataLength - keywordLength; w += blockDim.x)
    {
        bool windowMatches = false;
        for (int i = 0; i < keywordLength; i++)
        {
            if (sData[w + i] != sKeyword[i])
                break;
            if (i == keywordLength - 1)
                windowMatches = true;
        }
        if (windowMatches)
        {
            // Several threads may store here, but all of them store `true`.
            isFound = true;
            break;
        }
    }

    // Wait until every window has been examined before publishing the result.
    // Without this barrier a thread that finished early could write a stale
    // `false` over another thread's match.
    __syncthreads();
    if (threadIdx.x == 0)
        resultPtr[resultIndex] = isFound;
}


/*
 * Benchmark driver for approach 1.
 *
 * Generates random lowercase data strings and keywords, runs
 * searchKeywordKernel on the GPU, recomputes the same result on the CPU with a
 * naive search and reports whether both agree. Both result arrays are
 * keyword-major: entry [keyword*SEARCHITEMSIZE + data].
 *
 * Returns 0 on success; exits with EXIT_FAILURE on allocation or CUDA errors.
 */
int main()
{
    chrono::steady_clock::time_point startTime;
    chrono::steady_clock::time_point endTime;
    typedef chrono::duration<int, milli> millisecs_t;

    // Stop at the first failing CUDA call; otherwise the error would only
    // show up later as a mysterious mismatch or a failure in an unrelated call.
    auto check = [](cudaError_t err, const char* what)
    {
        if (err != cudaSuccess)
        {
            cerr << "CUDA error (" << what << "): " << cudaGetErrorString(err) << endl;
            exit(EXIT_FAILURE);
        }
    };

    //////////Search Data Init/////////////////
    cout << "Before Search Data Init" << endl;
    startTime = chrono::steady_clock::now();

    // calloc keeps the padding behind each string deterministic (zero).
    char* dataPtr = (char*)calloc((size_t)MAXDATASTRINGSIZE*SEARCHITEMSIZE, sizeof(char));
    if (dataPtr == nullptr)
    {
        cerr << "Host allocation failed for data strings" << endl;
        return EXIT_FAILURE;
    }
    short* dataLengths = new short[SEARCHITEMSIZE];
    short temp;
    short tempChar;
    for (int i = 0; i < SEARCHITEMSIZE; i++)
    {
        temp = rand() % (MAXDATASTRINGSIZE - 20) + 20;
        for (int k = 0; k < temp; k++)
        {
            tempChar = rand() % 26;
            dataPtr[i*MAXDATASTRINGSIZE + k] = 97 + tempChar;  //97->a, 98->b, 122->z
        }
        dataLengths[i] = temp;
    }
    endTime = chrono::steady_clock::now();
    millisecs_t duration(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "After Search Data Init: " << duration.count() << "ms" << endl;
    //////////Search Data Init/////////////////

    //////////Search Keyword Init/////////////////
    cout << "Before Search Keyword Init" << endl;
    startTime = chrono::steady_clock::now();

    char* keywordPtr = (char*)calloc((size_t)MAXKEYWORDSTRINGSSIZE*SEARCHTERMSIZE, sizeof(char));
    if (keywordPtr == nullptr)
    {
        cerr << "Host allocation failed for keywords" << endl;
        return EXIT_FAILURE;
    }
    short* keywordLengths = new short[SEARCHTERMSIZE]; // lengths, not the start positions
    for (int i = 0; i < SEARCHTERMSIZE; i++)
    {
        temp = rand() % (MAXKEYWORDSTRINGSSIZE - 10) + 10;
        for (int k = 0; k < temp; k++)
        {
            tempChar = rand() % 26;
            keywordPtr[i*MAXKEYWORDSTRINGSSIZE + k] = 97 + tempChar;  //97->a, 98->b, 122->z
        }
        keywordLengths[i] = temp;
    }
    endTime = chrono::steady_clock::now();
    millisecs_t duration1(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "After Search Keyword Init: " << duration1.count() << "ms" << endl;
    //////////Search Keyword Init/////////////////

    char* d_dataPtr;
    short* d_dataLengths;
    char* d_keywordPtr;
    short* d_keywordLengths;
    bool* d_resultPtr;

    /////////////////////////CudaMalloc/////////////////////////////////
    cout << "Before Malloc" << endl;
    startTime = chrono::steady_clock::now();

    check(cudaMalloc(&d_dataPtr, sizeof(char) * SEARCHITEMSIZE * MAXDATASTRINGSIZE), "cudaMalloc data");
    check(cudaMalloc(&d_dataLengths, sizeof(short) * SEARCHITEMSIZE), "cudaMalloc data lengths");
    check(cudaMalloc(&d_keywordPtr, sizeof(char) * SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE), "cudaMalloc keywords");
    check(cudaMalloc(&d_keywordLengths, sizeof(short) * SEARCHTERMSIZE), "cudaMalloc keyword lengths");
    check(cudaMalloc(&d_resultPtr, sizeof(bool)*SEARCHITEMSIZE * SEARCHTERMSIZE), "cudaMalloc result");

    endTime = chrono::steady_clock::now();
    millisecs_t duration2(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "After Malloc: " << duration2.count() << "ms" << endl;
    /////////////////////////CudaMalloc/////////////////////////////////

    cudaEvent_t start, stop;
    float elapsedTime;

    /////////////////////////CudaMemCpy///////////////////////////////////
    cout << "Before Memcpy" << endl;
    check(cudaEventCreate(&start), "cudaEventCreate");
    check(cudaEventCreate(&stop), "cudaEventCreate");
    check(cudaEventRecord(start, 0), "cudaEventRecord");

    check(cudaMemcpy(d_dataPtr, dataPtr, sizeof(char) * SEARCHITEMSIZE * MAXDATASTRINGSIZE, cudaMemcpyHostToDevice), "cudaMemcpy data");
    check(cudaMemcpy(d_dataLengths, dataLengths, sizeof(short) * SEARCHITEMSIZE, cudaMemcpyHostToDevice), "cudaMemcpy data lengths");
    check(cudaMemcpy(d_keywordPtr, keywordPtr, sizeof(char) * SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE, cudaMemcpyHostToDevice), "cudaMemcpy keywords");
    check(cudaMemcpy(d_keywordLengths, keywordLengths, sizeof(short) * SEARCHTERMSIZE, cudaMemcpyHostToDevice), "cudaMemcpy keyword lengths");

    check(cudaEventRecord(stop, 0), "cudaEventRecord");
    check(cudaEventSynchronize(stop), "cudaEventSynchronize");
    check(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
    check(cudaEventDestroy(start), "cudaEventDestroy");
    check(cudaEventDestroy(stop), "cudaEventDestroy");
    cout << "After Memcpy: " << elapsedTime << "ms" << endl;
    /////////////////////////CudaMemCpy///////////////////////////////////

    ////////////////////////Kernel//////////////////////////////////////////
    cout << "Before Kernel" << endl;
    check(cudaEventCreate(&start), "cudaEventCreate");
    check(cudaEventCreate(&stop), "cudaEventCreate");
    check(cudaEventRecord(start, 0), "cudaEventRecord");

    // One block per (data string, keyword) pair.
    dim3 dimGrid(SEARCHITEMSIZE, SEARCHTERMSIZE);
    searchKeywordKernel<<<dimGrid, MAXDATASTRINGSIZE>>>(d_resultPtr, d_dataPtr, d_dataLengths, d_keywordPtr, d_keywordLengths);
    // A launch does not return an error itself; a bad configuration is
    // reported through cudaGetLastError().
    check(cudaGetLastError(), "kernel launch");

    check(cudaEventRecord(stop, 0), "cudaEventRecord");
    // Faults raised while the kernel runs surface at this synchronisation.
    check(cudaEventSynchronize(stop), "kernel execution");
    check(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
    check(cudaEventDestroy(start), "cudaEventDestroy");
    check(cudaEventDestroy(stop), "cudaEventDestroy");
    cout << "After Kernel: " << elapsedTime << "ms" << endl;
    ////////////////////////Kernel//////////////////////////////////////////

    bool* result = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];

    check(cudaMemcpy(result, d_resultPtr, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost), "cudaMemcpy result");

    /////////////////////////////////// CPU code //////////////////////////////////////////

    bool* cpuResult = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];

    cout << "CPU code starts" << endl;
    startTime = chrono::steady_clock::now();
    for (int i = 0; i < SEARCHTERMSIZE; i++)
    {
        for (int j = 0; j < SEARCHITEMSIZE; j++)
        {
            // Every (keyword, data) entry is written exactly once. When the
            // data string is shorter than the keyword the window loop does
            // not run and the entry stays false.
            bool found = false;
            for (int k = 0; k <= dataLengths[j] - keywordLengths[i] && !found; k++)
            {
                found = true;
                for (int l = 0; l < keywordLengths[i]; l++)
                {
                    if (dataPtr[j*MAXDATASTRINGSIZE + k + l] != keywordPtr[i*MAXKEYWORDSTRINGSSIZE + l])
                    {
                        found = false;
                        break;
                    }
                }
            }
            cpuResult[i*SEARCHITEMSIZE + j] = found;
        }
    }
    endTime = chrono::steady_clock::now();
    millisecs_t duration3(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "CPU code ends: " << duration3.count() << "ms" << endl;
    /////////////////////////////////// CPU code //////////////////////////////////////////

    ////////////////////////////////////Result Comparison////////////////////////////////////////

    bool kernelRes = true;
    for (int i = 0; i < SEARCHITEMSIZE*SEARCHTERMSIZE; i++)
    {
        if (cpuResult[i] != result[i])
        {
            kernelRes = false;
            break;
        }
    }
    ////////////////////////////////////Result Comparison////////////////////////////////////////

    cout << boolalpha << "Kernel computation: " << kernelRes << endl;

    cout << "Before Deleting arrays" << endl;
    // Buffers from calloc are released with free, buffers from new[] with delete[].
    free(dataPtr);
    free(keywordPtr);
    delete[] dataLengths;
    delete[] keywordLengths;
    delete[] result;
    delete[] cpuResult;
    cout << "After Deleting arrays" << endl;

    cout << "Before Freeing device memory" << endl;
    check(cudaFree(d_dataPtr), "cudaFree data");
    check(cudaFree(d_keywordPtr), "cudaFree keywords");
    check(cudaFree(d_dataLengths), "cudaFree data lengths");
    check(cudaFree(d_keywordLengths), "cudaFree keyword lengths");
    check(cudaFree(d_resultPtr), "cudaFree result");
    cout << "After Freeing device memory" << endl;

    cudaDeviceReset();
    system("pause");
    return 0;
}

方法2的代码：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <chrono>
#include <cstdlib>

#define SEARCHTERMSIZE 198
#define SEARCHITEMSIZE 65000
#define MAXDATASTRINGSIZE 250
#define MAXKEYWORDSTRINGSSIZE 50

using namespace std;

/*
 * Approach 2: one thread block per data string, one thread per keyword.
 *
 * Launch layout: gridDim.x indexes data strings (blockIdx.x); threadIdx.x is
 * the keyword index, so blockDim.x must be at least SEARCHTERMSIZE for every
 * keyword to be searched. Surplus threads only help stage the data string.
 *
 * Static shared memory: MAXDATASTRINGSIZE chars holding the block's data
 * string.
 *
 * Preconditions: dataLengths[i] <= MAXDATASTRINGSIZE; data strings are stored
 * at a fixed pitch of MAXDATASTRINGSIZE; keywordPtr is transposed, i.e.
 * character j of keyword k is at keywordPtr[j*SEARCHTERMSIZE + k], so that
 * neighbouring threads read neighbouring bytes.
 *
 * Output: resultPtr[dataIndex*SEARCHTERMSIZE + keywordIndex] is true when the
 * keyword occurs in the data string, false otherwise.
 */
__global__ void searchKeywordKernel(bool* resultPtr, const char  * __restrict__ dataPtr, const short*  dataLengths, const char *  keywordPtr, const short*  keywordLengths)
{
    int dataIndex = blockIdx.x;
    int keywordIndex = threadIdx.x;
    __shared__ char sData[MAXDATASTRINGSIZE];

    // Block-uniform guard, so no thread is left waiting at the barrier below.
    if (dataIndex >= SEARCHITEMSIZE)
        return;

    int dataLength = dataLengths[dataIndex];

    // All threads of the block cooperate to stage the data string.
    for (int c = threadIdx.x; c < dataLength; c += blockDim.x)
        sData[c] = dataPtr[dataIndex*MAXDATASTRINGSIZE + c];
    __syncthreads();

    // Threads without a keyword leave only after the barrier, and before
    // keywordLengths is indexed (which would otherwise be out of bounds).
    if (keywordIndex >= SEARCHTERMSIZE)
        return;

    int keywordLength = keywordLengths[keywordIndex];

    // The flag is reset for every window; a mismatch in an earlier window
    // must not hide a later match. When the data string is shorter than the
    // keyword the loop does not run and the result stays false.
    bool isFound = false;
    for (int i = 0; i <= dataLength - keywordLength && !isFound; i++)
    {
        isFound = true;
        for (int j = 0; j < keywordLength; j++)
        {
            if (sData[i + j] != keywordPtr[j*SEARCHTERMSIZE + keywordIndex])
            {
                isFound = false;
                break;
            }
        }
    }
    resultPtr[dataIndex*SEARCHTERMSIZE + keywordIndex] = isFound;
}


/*
 * Benchmark driver for approach 2.
 *
 * Generates random lowercase data strings and keywords, transposes the
 * keyword table for the kernel, runs searchKeywordKernel on the GPU,
 * recomputes the same result on the CPU with a naive search and reports
 * whether both agree.
 *
 * The kernel stores results data-major ([data*SEARCHTERMSIZE + keyword]); the
 * CPU reference is keyword-major ([keyword*SEARCHITEMSIZE + data]), so the GPU
 * result is transposed on the host before the comparison.
 *
 * Returns 0 on success; exits with EXIT_FAILURE on allocation or CUDA errors.
 */
int main()
{
    chrono::steady_clock::time_point startTime;
    chrono::steady_clock::time_point endTime;
    typedef chrono::duration<int, milli> millisecs_t;

    // Stop at the first failing CUDA call; otherwise the error would only
    // show up later as a mysterious mismatch or a failure in an unrelated call.
    auto check = [](cudaError_t err, const char* what)
    {
        if (err != cudaSuccess)
        {
            cerr << "CUDA error (" << what << "): " << cudaGetErrorString(err) << endl;
            exit(EXIT_FAILURE);
        }
    };

    //////////Search Data Init/////////////////
    cout << "Before Search Data Init" << endl;
    startTime = chrono::steady_clock::now();

    // calloc keeps the padding behind each string deterministic (zero).
    char* dataPtr = (char*)calloc((size_t)MAXDATASTRINGSIZE*SEARCHITEMSIZE, sizeof(char));
    if (dataPtr == nullptr)
    {
        cerr << "Host allocation failed for data strings" << endl;
        return EXIT_FAILURE;
    }
    short* dataLengths = new short[SEARCHITEMSIZE];
    short temp;
    short tempChar;
    for (int i = 0; i < SEARCHITEMSIZE; i++)
    {
        temp = rand() % (MAXDATASTRINGSIZE - 20) + 20;
        for (int k = 0; k < temp; k++)
        {
            tempChar = rand() % 26;
            dataPtr[i*MAXDATASTRINGSIZE + k] = 97 + tempChar;  //97->a, 98->b, 122->z
        }
        dataLengths[i] = temp;
    }
    endTime = chrono::steady_clock::now();
    millisecs_t duration(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "After Search Data Init: " << duration.count() << "ms" << endl;
    //////////Search Data Init/////////////////

    //////////Search Keyword Init/////////////////
    cout << "Before Search Keyword Init" << endl;
    startTime = chrono::steady_clock::now();

    // Zero-initialised because the transposition below reads the whole
    // table, including the padding behind each keyword.
    char* keywordPtr = (char*)calloc((size_t)MAXKEYWORDSTRINGSSIZE*SEARCHTERMSIZE, sizeof(char));
    if (keywordPtr == nullptr)
    {
        cerr << "Host allocation failed for keywords" << endl;
        return EXIT_FAILURE;
    }
    short* keywordLengths = new short[SEARCHTERMSIZE]; // lengths, not the start positions
    for (int i = 0; i < SEARCHTERMSIZE; i++)
    {
        temp = rand() % (MAXKEYWORDSTRINGSSIZE - 10) + 10;
        for (int k = 0; k < temp; k++)
        {
            tempChar = rand() % 26;
            keywordPtr[i*MAXKEYWORDSTRINGSSIZE + k] = 97 + tempChar;  //97->a, 98->b, 122->z
        }
        keywordLengths[i] = temp;
    }
    endTime = chrono::steady_clock::now();
    millisecs_t duration1(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "After Search Keyword Init: " << duration1.count() << "ms" << endl;
    //////////Search Keyword Init/////////////////

    ////////////////////Transpose Keyword Array////////////////////////////

    // Character j of keyword i moves to [j*SEARCHTERMSIZE + i], the layout
    // the kernel indexes.
    char* keywordPtr_T = new char[SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE];
    for (int i = 0; i < SEARCHTERMSIZE; i++)
        for (int j = 0; j < MAXKEYWORDSTRINGSSIZE; j++)
            keywordPtr_T[j*SEARCHTERMSIZE + i] = keywordPtr[i*MAXKEYWORDSTRINGSSIZE + j];

    ////////////////////Transpose Keyword Array////////////////////////////

    char* d_dataPtr;
    short* d_dataLengths;
    char* d_keywordPtr;
    short* d_keywordLengths;
    bool* d_resultPtr;

    /////////////////////////CudaMalloc/////////////////////////////////
    cout << "Before Malloc" << endl;
    startTime = chrono::steady_clock::now();

    check(cudaMalloc(&d_dataPtr, sizeof(char) * SEARCHITEMSIZE * MAXDATASTRINGSIZE), "cudaMalloc data");
    check(cudaMalloc(&d_dataLengths, sizeof(short) * SEARCHITEMSIZE), "cudaMalloc data lengths");
    check(cudaMalloc(&d_keywordPtr, sizeof(char) * SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE), "cudaMalloc keywords");
    check(cudaMalloc(&d_keywordLengths, sizeof(short) * SEARCHTERMSIZE), "cudaMalloc keyword lengths");
    check(cudaMalloc(&d_resultPtr, sizeof(bool)*SEARCHITEMSIZE * SEARCHTERMSIZE), "cudaMalloc result");

    endTime = chrono::steady_clock::now();
    millisecs_t duration2(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "After Malloc: " << duration2.count() << "ms" << endl;
    /////////////////////////CudaMalloc/////////////////////////////////

    cudaEvent_t start, stop;
    float elapsedTime;

    /////////////////////////CudaMemCpy///////////////////////////////////
    cout << "Before Memcpy" << endl;
    check(cudaEventCreate(&start), "cudaEventCreate");
    check(cudaEventCreate(&stop), "cudaEventCreate");
    check(cudaEventRecord(start, 0), "cudaEventRecord");

    check(cudaMemcpy(d_dataPtr, dataPtr, sizeof(char) * SEARCHITEMSIZE * MAXDATASTRINGSIZE, cudaMemcpyHostToDevice), "cudaMemcpy data");
    check(cudaMemcpy(d_dataLengths, dataLengths, sizeof(short) * SEARCHITEMSIZE, cudaMemcpyHostToDevice), "cudaMemcpy data lengths");
    check(cudaMemcpy(d_keywordPtr, keywordPtr_T, sizeof(char) * SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE, cudaMemcpyHostToDevice), "cudaMemcpy keywords");
    check(cudaMemcpy(d_keywordLengths, keywordLengths, sizeof(short) * SEARCHTERMSIZE, cudaMemcpyHostToDevice), "cudaMemcpy keyword lengths");

    check(cudaEventRecord(stop, 0), "cudaEventRecord");
    check(cudaEventSynchronize(stop), "cudaEventSynchronize");
    check(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
    check(cudaEventDestroy(start), "cudaEventDestroy");
    check(cudaEventDestroy(stop), "cudaEventDestroy");
    cout << "After Memcpy: " << elapsedTime << "ms" << endl;
    /////////////////////////CudaMemCpy///////////////////////////////////

    ////////////////////////Kernel//////////////////////////////////////////
    cout << "Before Kernel" << endl;
    check(cudaEventCreate(&start), "cudaEventCreate");
    check(cudaEventCreate(&stop), "cudaEventCreate");
    check(cudaEventRecord(start, 0), "cudaEventRecord");

    // One block per data string, one thread per keyword.
    searchKeywordKernel<<<SEARCHITEMSIZE, SEARCHTERMSIZE>>>(d_resultPtr, d_dataPtr, d_dataLengths, d_keywordPtr, d_keywordLengths);
    // A launch does not return an error itself; a bad configuration is
    // reported through cudaGetLastError().
    check(cudaGetLastError(), "kernel launch");

    check(cudaEventRecord(stop, 0), "cudaEventRecord");
    // Faults raised while the kernel runs surface at this synchronisation.
    check(cudaEventSynchronize(stop), "kernel execution");
    check(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
    check(cudaEventDestroy(start), "cudaEventDestroy");
    check(cudaEventDestroy(stop), "cudaEventDestroy");
    cout << "After Kernel: " << elapsedTime << "ms" << endl;
    ////////////////////////Kernel//////////////////////////////////////////

    bool* result_T = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
    bool* result = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];

    check(cudaMemcpy(result_T, d_resultPtr, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost), "cudaMemcpy result");

    // result_T is data-major as written by the kernel; convert it to the
    // keyword-major layout used by the CPU reference (i = keyword, j = data).
    for (int i = 0; i < SEARCHTERMSIZE; i++)
        for (int j = 0; j < SEARCHITEMSIZE; j++)
            result[i*SEARCHITEMSIZE + j] = result_T[j*SEARCHTERMSIZE + i];

    /////////////////////////////////// CPU code //////////////////////////////////////////

    bool* cpuResult = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];

    cout << "CPU code starts" << endl;
    startTime = chrono::steady_clock::now();
    for (int i = 0; i < SEARCHTERMSIZE; i++)
    {
        for (int j = 0; j < SEARCHITEMSIZE; j++)
        {
            // Every (keyword, data) entry is written exactly once. When the
            // data string is shorter than the keyword the window loop does
            // not run and the entry stays false.
            bool found = false;
            for (int k = 0; k <= dataLengths[j] - keywordLengths[i] && !found; k++)
            {
                found = true;
                for (int l = 0; l < keywordLengths[i]; l++)
                {
                    if (dataPtr[j*MAXDATASTRINGSIZE + k + l] != keywordPtr[i*MAXKEYWORDSTRINGSSIZE + l])
                    {
                        found = false;
                        break;
                    }
                }
            }
            cpuResult[i*SEARCHITEMSIZE + j] = found;
        }
    }
    endTime = chrono::steady_clock::now();
    millisecs_t duration3(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "CPU code ends: " << duration3.count() << "ms" << endl;
    /////////////////////////////////// CPU code //////////////////////////////////////////

    ////////////////////////////////////Result Comparison////////////////////////////////////////

    bool kernelRes = true;
    for (int i = 0; i < SEARCHITEMSIZE*SEARCHTERMSIZE; i++)
    {
        if (cpuResult[i] != result[i])
        {
            kernelRes = false;
            break;
        }
    }
    ////////////////////////////////////Result Comparison////////////////////////////////////////

    cout << boolalpha << "Kernel computation: " << kernelRes << endl;

    cout << "Before Deleting arrays" << endl;
    // Buffers from calloc are released with free, buffers from new[] with delete[].
    free(dataPtr);
    free(keywordPtr);
    delete[] keywordPtr_T;
    delete[] dataLengths;
    delete[] keywordLengths;
    delete[] result;
    delete[] result_T;
    delete[] cpuResult;
    cout << "After Deleting arrays" << endl;

    cout << "Before Freeing device memory" << endl;
    check(cudaFree(d_dataPtr), "cudaFree data");
    check(cudaFree(d_keywordPtr), "cudaFree keywords");
    check(cudaFree(d_dataLengths), "cudaFree data lengths");
    check(cudaFree(d_keywordLengths), "cudaFree keyword lengths");
    check(cudaFree(d_resultPtr), "cudaFree result");
    cout << "After Freeing device memory" << endl;

    cudaDeviceReset();
    system("pause");
    return 0;
}

第二种方法比第一种方法产生更好的结果。然而，第二种方法的性能取决于关键字的数量。如果关键字的数量是192的倍数，则gpu的性能高于cpu（malloc的时间 + memcpy + 内核 < cpu的时间）。

我该怎么做才能克服这种依赖性？

增加线程数并传递多个数据字符串而不是每个块中有一个是否可行？

Answer 1

我建议blockDim = (16, 16, 1)和gridDim = (# of data strings / 16, # of keyword strings / 16, 1)。在您的情况下，数十个字符串可以理想地适合共享内存，这种块网格划分将导致最小的全局内存访问，同时不会引入计算开销。

填充不是一个好的选择，除非预期每个字符串的长度都非常接近最大值（例如达到最大值的80％）。如果保留每个字符串的偏移量数组（CPU擅长生成它），那么实现全局内存读取的合并访问就非常简单。

有关Cuda的字符串搜索的建议

1 个答案: