有关Cuda的字符串搜索的建议

时间:2015-04-15 10:55:28

标签: cuda parallel-processing

我要编写一个cuda代码,它在数据字符串集中搜索关键字字符串集,并为关键字 - 数据字符串对返回一个布尔数组。

数据字符串:目前,10000 (可能会有所不同)字符串,每个字符串最多有250个字符。

关键字字符串:此时,100(可能会有所不同)字符串,每个字符串最多有100个字符。

每个字符串的长度都是已知的。

我的问题是,在这种情况下,以下哪种方法可能更合适。

1:
gridDim.x =>关键字字符串数量
gridDim.y =>数据串数量
blockDim => (最大字符串大小(在这种情况下为250),1,1)
搜索将使用朴素算法。每个线程都会把关键字和数据的字符从全局内存加载到共享内存中。
每个线程将负责朴素搜索算法中的一个窗口。
结果将写入布尔数组。
因此,每个块将负责关键字 - 数据对。

第二:
gridDim => (数据串数,1,1)
blockDim => (关键字字符串数,1,1)
在每个块中,数据字符串将加载到共享mem。
在这种情况下,每个线程将负责关键字 - 数据对而不是块。
每个线程都会在数据字符串中搜索相应的关键字。在这种情况下不需要朴素算法,可以使用Boyer-Moore。

对于大型文件中的搜索,由于数据的长度远大于关键字的长度,因此通常使用第一种方法。但在这种情况下,我不确定第一种方法是否更好。另一方面,对于第二种方法,关键字的合并访问(coalescing)可能是一个问题,因为关键字长度不固定。关键字的大小有一个上限。因此,填充可能会改善合并访问,但会消耗更多内存。

无论如何,如果您已经处理过类似案例或了解比上述方案更好的方法,请帮帮我。先谢谢您。

所以,我已经实现了这两种情况。
方法1的代码:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "stdio.h"
#include "iostream"
#include "chrono"
#include "cstdlib"

#define SEARCHTERMSIZE 100
#define SEARCHITEMSIZE 65000
#define MAXDATASTRINGSIZE 250
#define MAXKEYWORDSTRINGSSIZE 50

using namespace std;

/*
 * Approach 1: one thread block handles one (data string, keyword) pair.
 *
 * Indexing used by this kernel:
 *   blockIdx.x  -> index of the data string
 *   blockIdx.y  -> index of the keyword
 *   threadIdx.x -> start offset of the window this thread compares; the
 *                  loops advance by blockDim.x, so any 1D block size works
 *                  (the host code in this file launches MAXDATASTRINGSIZE
 *                  threads per block).
 *
 * Parameters:
 *   resultPtr      [out] resultPtr[keywordIndex*SEARCHITEMSIZE + dataIndex]
 *                  is set to true when the keyword occurs in the data string,
 *                  false otherwise (an empty keyword counts as a match).
 *   dataPtr        data strings, one MAXDATASTRINGSIZE-byte slot per string
 *   dataLengths    length of each data string
 *   keywordPtr     keywords, one MAXKEYWORDSTRINGSSIZE-byte slot per keyword
 *   keywordLengths length of each keyword
 *
 * Assumes every data length is <= MAXDATASTRINGSIZE and every keyword length
 * is <= MAXKEYWORDSTRINGSSIZE; larger values would overflow the shared
 * buffers below.
 *
 * Shared memory: MAXDATASTRINGSIZE + MAXKEYWORDSTRINGSSIZE bytes plus one flag.
 */
__global__ void searchKeywordKernel(bool* resultPtr, const char * dataPtr, const short*  dataLengths, const char *  keywordPtr, const short*  keywordLengths)
{
    __shared__ char sData[MAXDATASTRINGSIZE];
    __shared__ char sKeyword[MAXKEYWORDSTRINGSSIZE];
    __shared__ bool isFound;

    int dataIndex = blockIdx.x;
    int keywordIndex = blockIdx.y;

    // The guard depends only on block indices, so either the whole block
    // leaves here or nobody does; the barriers below stay non-divergent.
    // Doing it first also keeps the length reads inside the array bounds.
    if (dataIndex >= SEARCHITEMSIZE || keywordIndex >= SEARCHTERMSIZE)
        return;

    int dataLength = dataLengths[dataIndex];
    int keywordLength = keywordLengths[keywordIndex];
    // Last window start that still fits; negative when the keyword is longer
    // than the data string, in which case no window is tested at all.
    int lastStart = dataLength - keywordLength;

    if (threadIdx.x == 0)
        isFound = false;

    // Stage both strings in shared memory, blockDim.x characters at a time.
    for (int t = threadIdx.x; t < dataLength; t += blockDim.x)
        sData[t] = dataPtr[dataIndex*MAXDATASTRINGSIZE + t];
    for (int t = threadIdx.x; t < keywordLength; t += blockDim.x)
        sKeyword[t] = keywordPtr[keywordIndex*MAXKEYWORDSTRINGSSIZE + t];
    __syncthreads();

    // Naive search: each thread compares the keyword against its own windows.
    for (int start = threadIdx.x; start <= lastStart; start += blockDim.x)
    {
        int matched = 0;
        while (matched < keywordLength && sData[start + matched] == sKeyword[matched])
            matched++;
        if (matched == keywordLength)
            isFound = true;   // several threads may store the same value
    }

    // Every window must be finished before the flag is published. Without
    // this barrier a thread that saw no match could write a stale "false"
    // over the "true" stored by the thread that did find the keyword.
    __syncthreads();

    if (threadIdx.x == 0)
        resultPtr[keywordIndex*SEARCHITEMSIZE + dataIndex] = isFound;
}


/*
 * Benchmark driver for approach 1 (one block per data/keyword pair).
 *
 * Steps: generate random lowercase data strings and keywords, copy them to
 * the device, run searchKeywordKernel, run the same naive search on the CPU,
 * then compare the two result arrays. Both arrays are keyword-major:
 * entry [keyword*SEARCHITEMSIZE + data].
 *
 * Any failing CUDA runtime call prints its error string and exits with
 * EXIT_FAILURE; otherwise the function returns 0.
 */
int main()
{
    typedef chrono::duration<int, milli> millisecs_t;
    chrono::steady_clock::time_point startTime;
    chrono::steady_clock::time_point endTime;

    // Stop at the first failing CUDA call instead of carrying on with
    // invalid device state and reporting meaningless timings/results.
    auto cudaCheck = [](cudaError_t status, const char* what)
    {
        if (status != cudaSuccess)
        {
            cerr << what << " failed: " << cudaGetErrorString(status) << endl;
            exit(EXIT_FAILURE);
        }
    };

    const size_t dataBytes = sizeof(char) * SEARCHITEMSIZE * MAXDATASTRINGSIZE;
    const size_t dataLengthBytes = sizeof(short) * SEARCHITEMSIZE;
    const size_t keywordBytes = sizeof(char) * SEARCHTERMSIZE * MAXKEYWORDSTRINGSSIZE;
    const size_t keywordLengthBytes = sizeof(short) * SEARCHTERMSIZE;
    const size_t resultBytes = sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE;

    //////////Search Data Init/////////////////
    cout << "Before Search Data Init" << endl;
    startTime = chrono::steady_clock::now();

    char* dataPtr = (char*)malloc(dataBytes);
    if (dataPtr == NULL)
    {
        cerr << "malloc of data strings failed" << endl;
        return EXIT_FAILURE;
    }
    short* dataLengths = new short[SEARCHITEMSIZE];
    short temp;
    short tempChar;
    for (int i = 0; i < SEARCHITEMSIZE; i++)
    {
        // Random length in [20, MAXDATASTRINGSIZE - 1], random letters a..z.
        temp = rand() % (MAXDATASTRINGSIZE - 20) + 20;
        for (int k = 0; k < temp; k++)
        {
            tempChar = rand() % 26;
            dataPtr[i*MAXDATASTRINGSIZE + k] = 97 + tempChar;  //97->a, 98->b, 122->z
        }
        dataLengths[i] = temp;
    }
    endTime = chrono::steady_clock::now();
    millisecs_t duration(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "After Search Data Init: " << duration.count() << "ms" << endl;
    //////////Search Data Init/////////////////

    //////////Search Keyword Init/////////////////
    cout << "Before Search Keyword Init" << endl;
    startTime = chrono::steady_clock::now();

    char* keywordPtr = (char*)malloc(keywordBytes);
    if (keywordPtr == NULL)
    {
        cerr << "malloc of keywords failed" << endl;
        return EXIT_FAILURE;
    }
    short* keywordLengths = new short[SEARCHTERMSIZE]; //lengths, not the start positions
    for (int i = 0; i < SEARCHTERMSIZE; i++)
    {
        // Random length in [10, MAXKEYWORDSTRINGSSIZE - 1].
        temp = rand() % (MAXKEYWORDSTRINGSSIZE - 10) + 10;
        for (int k = 0; k < temp; k++)
        {
            tempChar = rand() % 26;
            keywordPtr[i*MAXKEYWORDSTRINGSSIZE + k] = 97 + tempChar;  //97->a, 98->b, 122->z
        }
        keywordLengths[i] = temp;
    }
    endTime = chrono::steady_clock::now();
    millisecs_t duration1(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "After Search Keyword Init: " << duration1.count() << "ms" << endl;
    //////////Search Keyword Init/////////////////

    char* d_dataPtr;
    short* d_dataLengths;
    char* d_keywordPtr;
    short* d_keywordLengths;
    bool* d_resultPtr;

    /////////////////////////CudaMalloc/////////////////////////////////
    cout << "Before Malloc" << endl;
    startTime = chrono::steady_clock::now();

    cudaCheck(cudaMalloc(&d_dataPtr, dataBytes), "cudaMalloc(d_dataPtr)");
    cudaCheck(cudaMalloc(&d_dataLengths, dataLengthBytes), "cudaMalloc(d_dataLengths)");
    cudaCheck(cudaMalloc(&d_keywordPtr, keywordBytes), "cudaMalloc(d_keywordPtr)");
    cudaCheck(cudaMalloc(&d_keywordLengths, keywordLengthBytes), "cudaMalloc(d_keywordLengths)");
    cudaCheck(cudaMalloc(&d_resultPtr, resultBytes), "cudaMalloc(d_resultPtr)");

    endTime = chrono::steady_clock::now();
    millisecs_t duration2(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "After Malloc: " << duration2.count() << "ms" << endl;
    /////////////////////////CudaMalloc/////////////////////////////////

    cudaEvent_t start, stop;
    float elapsedTime;

    /////////////////////////CudaMemCpy///////////////////////////////////
    cout << "Before Memcpy" << endl;
    cudaCheck(cudaEventCreate(&start), "cudaEventCreate(start)");
    cudaCheck(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    cudaCheck(cudaEventRecord(start, 0), "cudaEventRecord(start)");

    cudaCheck(cudaMemcpy(d_dataPtr, dataPtr, dataBytes, cudaMemcpyHostToDevice), "cudaMemcpy(data)");
    cudaCheck(cudaMemcpy(d_dataLengths, dataLengths, dataLengthBytes, cudaMemcpyHostToDevice), "cudaMemcpy(dataLengths)");
    cudaCheck(cudaMemcpy(d_keywordPtr, keywordPtr, keywordBytes, cudaMemcpyHostToDevice), "cudaMemcpy(keywords)");
    cudaCheck(cudaMemcpy(d_keywordLengths, keywordLengths, keywordLengthBytes, cudaMemcpyHostToDevice), "cudaMemcpy(keywordLengths)");

    cudaCheck(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    cudaCheck(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    cudaCheck(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
    cudaCheck(cudaEventDestroy(start), "cudaEventDestroy(start)");
    cudaCheck(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    cout << "After Memcpy: " << elapsedTime << "ms" << endl;
    /////////////////////////CudaMemCpy///////////////////////////////////

    ////////////////////////Kernel//////////////////////////////////////////
    cout << "Before Kernel" << endl;
    cudaCheck(cudaEventCreate(&start), "cudaEventCreate(start)");
    cudaCheck(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    cudaCheck(cudaEventRecord(start, 0), "cudaEventRecord(start)");

    // One block per (data string, keyword) pair, one thread per window start.
    dim3 dimGrid(SEARCHITEMSIZE, SEARCHTERMSIZE);
    searchKeywordKernel<<<dimGrid, MAXDATASTRINGSIZE>>>(d_resultPtr, d_dataPtr, d_dataLengths, d_keywordPtr, d_keywordLengths);
    // A launch does not return a status itself; bad configurations show up here.
    cudaCheck(cudaGetLastError(), "searchKeywordKernel launch");

    cudaCheck(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    // Waiting on the stop event also reports faults raised while the kernel ran.
    cudaCheck(cudaEventSynchronize(stop), "searchKeywordKernel execution");
    cudaCheck(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
    cudaCheck(cudaEventDestroy(start), "cudaEventDestroy(start)");
    cudaCheck(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    cout << "After Kernel: " << elapsedTime << "ms" << endl;
    ////////////////////////Kernel//////////////////////////////////////////

    bool* result = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];

    cudaCheck(cudaMemcpy(result, d_resultPtr, resultBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(result)");

    /////////////////////////////////// CPU code //////////////////////////////////////////

    bool* cpuResult = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];

    cout << "CPU code starts" << endl;
    startTime = chrono::steady_clock::now();
    for (int i = 0; i < SEARCHTERMSIZE; i++)
    {
        const char* keyword = keywordPtr + i*MAXKEYWORDSTRINGSSIZE;
        const int keywordLength = keywordLengths[i];
        for (int j = 0; j < SEARCHITEMSIZE; j++)
        {
            const char* data = dataPtr + j*MAXDATASTRINGSIZE;
            // Negative when the keyword is longer than the data string: the
            // window loop is then skipped and this pair is recorded as
            // "not found", while the remaining data strings are still
            // processed (every entry of cpuResult gets written).
            const int lastStart = dataLengths[j] - keywordLength;
            bool found = false;
            for (int k = 0; k <= lastStart && !found; k++)
            {
                int l = 0;
                while (l < keywordLength && data[k + l] == keyword[l])
                    l++;
                found = (l == keywordLength);
            }
            cpuResult[i*SEARCHITEMSIZE + j] = found;
        }
    }
    endTime = chrono::steady_clock::now();
    millisecs_t duration3(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "CPU code ends: " << duration3.count() << "ms" << endl;
    /////////////////////////////////// CPU code //////////////////////////////////////////

    ////////////////////////////////////Result Comparison////////////////////////////////////////

    bool kernelRes = true;
    for (int i = 0; i < SEARCHITEMSIZE*SEARCHTERMSIZE; i++)
    {
        if (cpuResult[i] != result[i])
        {
            kernelRes = false;
            break;
        }
    }
    ////////////////////////////////////Result Comparison////////////////////////////////////////

    cout << boolalpha << "Kernel computation: " << kernelRes << endl;

    cout << "Before Deleting arrays" << endl;
    // Buffers obtained from malloc must go back through free, not delete[].
    free(dataPtr);
    free(keywordPtr);
    delete[] dataLengths;
    delete[] keywordLengths;
    delete[] result;
    delete[] cpuResult;
    cout << "After Deleting arrays" << endl;

    cout << "Before Freeing device memory" << endl;
    cudaCheck(cudaFree(d_dataPtr), "cudaFree(d_dataPtr)");
    cudaCheck(cudaFree(d_keywordPtr), "cudaFree(d_keywordPtr)");
    cudaCheck(cudaFree(d_dataLengths), "cudaFree(d_dataLengths)");
    cudaCheck(cudaFree(d_keywordLengths), "cudaFree(d_keywordLengths)");
    cudaCheck(cudaFree(d_resultPtr), "cudaFree(d_resultPtr)");
    cout << "After Freeing device memory" << endl;

    cudaDeviceReset();
    system("pause");
    return 0;
}

方法2的代码:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <chrono>
#include <cstdlib>

#define SEARCHTERMSIZE 198
#define SEARCHITEMSIZE 65000
#define MAXDATASTRINGSIZE 250
#define MAXKEYWORDSTRINGSSIZE 50

using namespace std;

/*
 * Approach 2: one thread block per data string, one thread per keyword.
 *
 * Indexing used by this kernel:
 *   blockIdx.x  -> index of the data string
 *   threadIdx.x -> index of the keyword this thread searches for
 *
 * Parameters:
 *   resultPtr      [out] resultPtr[dataIndex*SEARCHTERMSIZE + keywordIndex]
 *                  is set to true when the keyword occurs in the data string
 *                  (note: data-major layout).
 *   dataPtr        data strings, one MAXDATASTRINGSIZE-byte slot per string
 *   dataLengths    length of each data string
 *   keywordPtr     keywords stored transposed: character j of keyword k is
 *                  read from keywordPtr[j*SEARCHTERMSIZE + k], so threads
 *                  with neighbouring keyword indices read neighbouring bytes
 *   keywordLengths length of each keyword
 *
 * Assumes every data length is <= MAXDATASTRINGSIZE (size of the shared
 * buffer below).
 *
 * Shared memory: MAXDATASTRINGSIZE bytes.
 */
__global__ void searchKeywordKernel(bool* resultPtr, const char  * __restrict__ dataPtr, const short*  dataLengths, const char *  keywordPtr, const short*  keywordLengths)
{
    __shared__ char sData[MAXDATASTRINGSIZE];

    int dataIndex = blockIdx.x;
    int keywordIndex = threadIdx.x;

    // Depends only on blockIdx, so the whole block leaves together and the
    // barrier below is never reached by only part of a block.
    if (dataIndex >= SEARCHITEMSIZE)
        return;

    int dataLength = dataLengths[dataIndex];

    // All threads cooperate to stage the data string in shared memory,
    // blockDim.x characters per pass.
    for (int t = threadIdx.x; t < dataLength; t += blockDim.x)
        sData[t] = dataPtr[dataIndex*MAXDATASTRINGSIZE + t];
    __syncthreads();

    // Threads without a keyword only helped with the load above. Leaving
    // after the barrier is safe, and it keeps the keywordLengths read below
    // inside the array.
    if (keywordIndex >= SEARCHTERMSIZE)
        return;

    int keywordLength = keywordLengths[keywordIndex];

    // Naive search. The match flag is recomputed for every window: a
    // mismatch at an early offset must not prevent a later window from
    // reporting a hit. When the keyword is longer than the data string the
    // loop body never runs and the result is false.
    bool isFound = false;
    for (int i = 0; i <= dataLength - keywordLength && !isFound; i++)
    {
        int j = 0;
        while (j < keywordLength && sData[i + j] == keywordPtr[j*SEARCHTERMSIZE + keywordIndex])
            j++;
        isFound = (j == keywordLength);
    }
    resultPtr[dataIndex*SEARCHTERMSIZE + keywordIndex] = isFound;
}


/*
 * Benchmark driver for approach 2 (one block per data string, one thread
 * per keyword).
 *
 * Steps: generate random lowercase data strings and keywords, transpose the
 * keyword array for the kernel, copy everything to the device, run
 * searchKeywordKernel, convert its data-major output to keyword-major, run
 * the same naive search on the CPU, then compare the two result arrays
 * (both indexed [keyword*SEARCHITEMSIZE + data]).
 *
 * Any failing CUDA runtime call prints its error string and exits with
 * EXIT_FAILURE; otherwise the function returns 0.
 */
int main()
{
    typedef chrono::duration<int, milli> millisecs_t;
    chrono::steady_clock::time_point startTime;
    chrono::steady_clock::time_point endTime;

    // Stop at the first failing CUDA call instead of carrying on with
    // invalid device state and reporting meaningless timings/results.
    auto cudaCheck = [](cudaError_t status, const char* what)
    {
        if (status != cudaSuccess)
        {
            cerr << what << " failed: " << cudaGetErrorString(status) << endl;
            exit(EXIT_FAILURE);
        }
    };

    const size_t dataBytes = sizeof(char) * SEARCHITEMSIZE * MAXDATASTRINGSIZE;
    const size_t dataLengthBytes = sizeof(short) * SEARCHITEMSIZE;
    const size_t keywordBytes = sizeof(char) * SEARCHTERMSIZE * MAXKEYWORDSTRINGSSIZE;
    const size_t keywordLengthBytes = sizeof(short) * SEARCHTERMSIZE;
    const size_t resultBytes = sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE;

    //////////Search Data Init/////////////////
    cout << "Before Search Data Init" << endl;
    startTime = chrono::steady_clock::now();

    char* dataPtr = (char*)malloc(dataBytes);
    if (dataPtr == NULL)
    {
        cerr << "malloc of data strings failed" << endl;
        return EXIT_FAILURE;
    }
    short* dataLengths = new short[SEARCHITEMSIZE];
    short temp;
    short tempChar;
    for (int i = 0; i < SEARCHITEMSIZE; i++)
    {
        // Random length in [20, MAXDATASTRINGSIZE - 1], random letters a..z.
        temp = rand() % (MAXDATASTRINGSIZE - 20) + 20;
        for (int k = 0; k < temp; k++)
        {
            tempChar = rand() % 26;
            dataPtr[i*MAXDATASTRINGSIZE + k] = 97 + tempChar;  //97->a, 98->b, 122->z
        }
        dataLengths[i] = temp;
    }
    endTime = chrono::steady_clock::now();
    millisecs_t duration(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "After Search Data Init: " << duration.count() << "ms" << endl;
    //////////Search Data Init/////////////////

    //////////Search Keyword Init/////////////////
    cout << "Before Search Keyword Init" << endl;
    startTime = chrono::steady_clock::now();

    char* keywordPtr = (char*)malloc(keywordBytes);
    if (keywordPtr == NULL)
    {
        cerr << "malloc of keywords failed" << endl;
        return EXIT_FAILURE;
    }
    short* keywordLengths = new short[SEARCHTERMSIZE]; //lengths, not the start positions
    for (int i = 0; i < SEARCHTERMSIZE; i++)
    {
        // Random length in [10, MAXKEYWORDSTRINGSSIZE - 1].
        temp = rand() % (MAXKEYWORDSTRINGSSIZE - 10) + 10;
        for (int k = 0; k < temp; k++)
        {
            tempChar = rand() % 26;
            keywordPtr[i*MAXKEYWORDSTRINGSSIZE + k] = 97 + tempChar;  //97->a, 98->b, 122->z
        }
        keywordLengths[i] = temp;
    }
    endTime = chrono::steady_clock::now();
    millisecs_t duration1(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "After Search Keyword Init: " << duration1.count() << "ms" << endl;
    //////////Search Keyword Init/////////////////

    ////////////////////Transpose Keyword Array////////////////////////////

    // The kernel reads character j of keyword i at [j*SEARCHTERMSIZE + i].
    // Only the characters that were actually generated are copied; the rest
    // of the zero-initialised buffer is never compared by the kernel.
    char* keywordPtr_T = new char[SEARCHTERMSIZE*MAXKEYWORDSTRINGSSIZE]();
    for (int i = 0; i < SEARCHTERMSIZE; i++)
        for (int j = 0; j < keywordLengths[i]; j++)
            keywordPtr_T[j*SEARCHTERMSIZE + i] = keywordPtr[i*MAXKEYWORDSTRINGSSIZE + j];

    ////////////////////Transpose Keyword Array////////////////////////////

    char* d_dataPtr;
    short* d_dataLengths;
    char* d_keywordPtr;
    short* d_keywordLengths;
    bool* d_resultPtr;

    /////////////////////////CudaMalloc/////////////////////////////////
    cout << "Before Malloc" << endl;
    startTime = chrono::steady_clock::now();

    cudaCheck(cudaMalloc(&d_dataPtr, dataBytes), "cudaMalloc(d_dataPtr)");
    cudaCheck(cudaMalloc(&d_dataLengths, dataLengthBytes), "cudaMalloc(d_dataLengths)");
    cudaCheck(cudaMalloc(&d_keywordPtr, keywordBytes), "cudaMalloc(d_keywordPtr)");
    cudaCheck(cudaMalloc(&d_keywordLengths, keywordLengthBytes), "cudaMalloc(d_keywordLengths)");
    cudaCheck(cudaMalloc(&d_resultPtr, resultBytes), "cudaMalloc(d_resultPtr)");

    endTime = chrono::steady_clock::now();
    millisecs_t duration2(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "After Malloc: " << duration2.count() << "ms" << endl;
    /////////////////////////CudaMalloc/////////////////////////////////

    cudaEvent_t start, stop;
    float elapsedTime;

    /////////////////////////CudaMemCpy///////////////////////////////////
    cout << "Before Memcpy" << endl;
    cudaCheck(cudaEventCreate(&start), "cudaEventCreate(start)");
    cudaCheck(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    cudaCheck(cudaEventRecord(start, 0), "cudaEventRecord(start)");

    cudaCheck(cudaMemcpy(d_dataPtr, dataPtr, dataBytes, cudaMemcpyHostToDevice), "cudaMemcpy(data)");
    cudaCheck(cudaMemcpy(d_dataLengths, dataLengths, dataLengthBytes, cudaMemcpyHostToDevice), "cudaMemcpy(dataLengths)");
    cudaCheck(cudaMemcpy(d_keywordPtr, keywordPtr_T, keywordBytes, cudaMemcpyHostToDevice), "cudaMemcpy(keywords)");
    cudaCheck(cudaMemcpy(d_keywordLengths, keywordLengths, keywordLengthBytes, cudaMemcpyHostToDevice), "cudaMemcpy(keywordLengths)");

    cudaCheck(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    cudaCheck(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    cudaCheck(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
    cudaCheck(cudaEventDestroy(start), "cudaEventDestroy(start)");
    cudaCheck(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    cout << "After Memcpy: " << elapsedTime << "ms" << endl;
    /////////////////////////CudaMemCpy///////////////////////////////////

    ////////////////////////Kernel//////////////////////////////////////////
    cout << "Before Kernel" << endl;
    cudaCheck(cudaEventCreate(&start), "cudaEventCreate(start)");
    cudaCheck(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    cudaCheck(cudaEventRecord(start, 0), "cudaEventRecord(start)");

    // One block per data string, one thread per keyword.
    searchKeywordKernel<<<SEARCHITEMSIZE, SEARCHTERMSIZE>>>(d_resultPtr, d_dataPtr, d_dataLengths, d_keywordPtr, d_keywordLengths);
    // A launch does not return a status itself; bad configurations show up here.
    cudaCheck(cudaGetLastError(), "searchKeywordKernel launch");

    cudaCheck(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    // Waiting on the stop event also reports faults raised while the kernel ran.
    cudaCheck(cudaEventSynchronize(stop), "searchKeywordKernel execution");
    cudaCheck(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
    cudaCheck(cudaEventDestroy(start), "cudaEventDestroy(start)");
    cudaCheck(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    cout << "After Kernel: " << elapsedTime << "ms" << endl;
    ////////////////////////Kernel//////////////////////////////////////////

    bool* result_T = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
    bool* result = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];

    cudaCheck(cudaMemcpy(result_T, d_resultPtr, resultBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(result)");

    // The kernel writes data-major ([data*SEARCHTERMSIZE + keyword]); the CPU
    // reference below is keyword-major ([keyword*SEARCHITEMSIZE + data]).
    for (int i = 0; i < SEARCHTERMSIZE; i++)
        for (int j = 0; j < SEARCHITEMSIZE; j++)
            result[i*SEARCHITEMSIZE + j] = result_T[j*SEARCHTERMSIZE + i];

    /////////////////////////////////// CPU code //////////////////////////////////////////

    bool* cpuResult = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];

    cout << "CPU code starts" << endl;
    startTime = chrono::steady_clock::now();
    for (int i = 0; i < SEARCHTERMSIZE; i++)
    {
        const char* keyword = keywordPtr + i*MAXKEYWORDSTRINGSSIZE;
        const int keywordLength = keywordLengths[i];
        for (int j = 0; j < SEARCHITEMSIZE; j++)
        {
            const char* data = dataPtr + j*MAXDATASTRINGSIZE;
            // Negative when the keyword is longer than the data string: the
            // window loop is then skipped and this pair is recorded as
            // "not found", while the remaining data strings are still
            // processed (every entry of cpuResult gets written).
            const int lastStart = dataLengths[j] - keywordLength;
            bool found = false;
            for (int k = 0; k <= lastStart && !found; k++)
            {
                int l = 0;
                while (l < keywordLength && data[k + l] == keyword[l])
                    l++;
                found = (l == keywordLength);
            }
            cpuResult[i*SEARCHITEMSIZE + j] = found;
        }
    }
    endTime = chrono::steady_clock::now();
    millisecs_t duration3(chrono::duration_cast<millisecs_t>(endTime - startTime));
    cout << "CPU code ends: " << duration3.count() << "ms" << endl;
    /////////////////////////////////// CPU code //////////////////////////////////////////

    ////////////////////////////////////Result Comparison////////////////////////////////////////

    bool kernelRes = true;
    for (int i = 0; i < SEARCHITEMSIZE*SEARCHTERMSIZE; i++)
    {
        if (cpuResult[i] != result[i])
        {
            kernelRes = false;
            break;
        }
    }
    ////////////////////////////////////Result Comparison////////////////////////////////////////

    cout << boolalpha << "Kernel computation: " << kernelRes << endl;

    cout << "Before Deleting arrays" << endl;
    // Buffers obtained from malloc must go back through free, not delete[].
    free(dataPtr);
    free(keywordPtr);
    delete[] keywordPtr_T;
    delete[] dataLengths;
    delete[] keywordLengths;
    delete[] result;
    delete[] result_T;
    delete[] cpuResult;
    cout << "After Deleting arrays" << endl;

    cout << "Before Freeing device memory" << endl;
    cudaCheck(cudaFree(d_dataPtr), "cudaFree(d_dataPtr)");
    cudaCheck(cudaFree(d_keywordPtr), "cudaFree(d_keywordPtr)");
    cudaCheck(cudaFree(d_dataLengths), "cudaFree(d_dataLengths)");
    cudaCheck(cudaFree(d_keywordLengths), "cudaFree(d_keywordLengths)");
    cudaCheck(cudaFree(d_resultPtr), "cudaFree(d_resultPtr)");
    cout << "After Freeing device memory" << endl;

    cudaDeviceReset();
    system("pause");
    return 0;
}

第二种方法比第一种方法产生更好的结果。然而,第二种方法的表现取决于关键字的数量。如果关键字的数量是192的倍数,则gpu的性能高于cpu(malloc的时间 + memcpy + 内核 < cpu的时间)。

我该怎么做才能克服这种依赖性?

增加线程数并传递多个数据字符串而不是每个块中有一个是否可行?

1 个答案:

答案 0 :(得分:1)

我建议blockDim = (16, 16, 1),gridDim = (# of data strings / 16, # of keyword strings / 16, 1)。在您的情况下,数十个字符串正好可以放入共享内存,这种块网格划分将使全局内存访问最少,同时不会引入计算开销。

填充不是一个好的选择,除非预期每个字符串的长度非常接近最大值(例如最大值的80%)。如果保留每个字符串的偏移量数组(CPU擅长生成它),那么实现合并的全局内存读取就很简单。