所以我在CUDA中实现了(或者至少尝试过)一个Sobel滤波器,我的代码如下。当我执行这个文件时,我得到一半正确的索贝尔滤波图像,另一半是黑色。我无法上传图片,因为它们是.pgm格式。因此,代码所做的是在.pgm格式的灰度图像中读取,并使用共享存储器概念将Sobel滤镜掩码与其进行卷积。我使用1024 x 1024 .pgm图像作为输入,它返回一个Sobel滤镜图像,其边缘有一半水平切割,因此下半部分为黑色。有人可以帮帮我吗。另外,我对代码有点好奇,我真的不明白第二批加载的作用,所以你能解释一下。


/* sobel.cu */

#include <stdio.h>
#include <stdlib.h>
#include <float.h>
#include <time.h>
#include "mypgm.h"

#define Mask_width  3
#define Mask_radius Mask_width/2
#define TILE_WIDTH 16
#define w (TILE_WIDTH + Mask_width - 1)
#define clamp(x) (min(max((x), 0.0), 1.0))

__global__ void convolution(float *I, const float* __restrict__ M, float *P, int width, int height) {
    __shared__ float N_ds[w][w];
    int k;

    // First batch loading
    int dest = threadIdx.y * TILE_WIDTH + threadIdx.x,
        destY = dest / w, destX = dest % w,
        srcY = blockIdx.y * TILE_WIDTH + destY - Mask_radius,
        srcX = blockIdx.x * TILE_WIDTH + destX - Mask_radius,
        src = srcY * width + srcX;
    if (srcY >= 0 && srcY < height && srcX >= 0 && srcX < width)
        N_ds[destY][destX] = I[src];
        N_ds[destY][destX] = 0;
    for (int iter = 1; iter <= (w*w) / (TILE_WIDTH*TILE_WIDTH); iter++)
        // Second batch loading
        dest = threadIdx.y * TILE_WIDTH + threadIdx.x + TILE_WIDTH * TILE_WIDTH;
        destY = dest / w, destX = dest % w;
        srcY = blockIdx.y * TILE_WIDTH + destY - Mask_radius;
        srcX = blockIdx.x * TILE_WIDTH + destX - Mask_radius;
        src = srcY * width + srcX;
        if (destY < w) {
            if (srcY >= 0 && srcY < height && srcX >= 0 && srcX < width)
                N_ds[destY][destX] = I[src];
                N_ds[destY][destX] = 0;

    float accum = 0;
    int y, x;
    for (y = 0; y < Mask_width; y++)
        for (x = 0; x < Mask_width; x++)
            accum += N_ds[threadIdx.y + y][threadIdx.x + x] * M[y * Mask_width + x];

    y = blockIdx.y * TILE_WIDTH + threadIdx.y;
    x = blockIdx.x * TILE_WIDTH + threadIdx.x;
    if (y < height && x < width)
        P[y * width + x] = accum;



void sobel_filtering()
/* Spatial filtering of image data */
/* Sobel filter (horizontal differentiation */
/* Input: image1[y][x] ---- Outout: image2[y][x] */

    /* Definition of Sobel filter in horizontal direction */
    float weight[3][3] = { { -1,  0,  1 },
              { -2,  0,  2 },
              { -1,  0,  1 } };
    float pixel_value;

    int x, y, i, j;  /* Loop variable */
    float * deviceInputImageData;
    float * deviceOutputImageData;
    float * deviceMaskData;

    cudaMalloc((void **)&deviceInputImageData, x_size1 * y_size1 * sizeof(float));
    cudaMalloc((void **)&deviceOutputImageData, x_size1 * y_size1 * sizeof(float));
    cudaMalloc((void **)&deviceMaskData, 3 * 3 * sizeof(float));

    cudaMemcpy(deviceInputImageData, image1, x_size1 * y_size1 * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(deviceMaskData, weight, 3 * 3 * sizeof(float), cudaMemcpyHostToDevice);

    /* Maximum values calculation after filtering*/
    printf("Now, filtering of input image is performed\n\n");
    x_size2 = x_size1;
    y_size2 = y_size1;
    for (y = 0; y < y_size2; y++) {
        for (x = 0; x < x_size2; x++) {
            image2[y][x] = 0;

    dim3 dimGrid(ceil((float)x_size1 / TILE_WIDTH), ceil((float)y_size1 / TILE_WIDTH));
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
    convolution<<<dimGrid, dimBlock>>>(deviceInputImageData, deviceMaskData, deviceOutputImageData, x_size1, y_size1);

        x_size2 * y_size2 * sizeof(float),



int main()
    load_image_data();   /* Input of image1 */

    clock_t begin = clock();
    sobel_filtering();   /* Sobel filter is applied to image1 */
    clock_t end = clock();
    double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
    printf("\n\nTiming result of multiplication of matrix-vector: %f\n", time_spent);
    save_image_data();   /* Output of image2 */
    return 0;


/* pgm file IO headerfile ------ mypgm.h */

/* Constant declaration */

//#define MAX_IMAGESIZE  1024

#define MAX_IMAGEWIDTH  3840
#define MAX_IMAGEHEIGHT  2160
#define MAX_BRIGHTNESS  255 /* Maximum gray level */
#define GRAYLEVEL       256 /* No. of gray levels */
#define MAX_FILENAME    256 /* Filename length limit */
#define MAX_BUFFERSIZE  256

/* Global constant declaration */
/* Image storage arrays */
int x_size1, y_size1, /* width & height of image1*/
x_size2, y_size2; /* width & height of image2 */

/* Prototype declaration of functions */
void load_image_data( ); /* image input */
void save_image_data( ); /* image output*/
void load_image_file(char *); /* image input */
void save_image_file(char *); /* image output*/

/* Main body of functions */

void load_image_data()
/* Input of header & body information of pgm file */
/* for image1[ ][ ],x_size1,y_size1 */
    char file_name[MAX_FILENAME];
    char buffer[MAX_BUFFERSIZE];
    FILE *fp; /* File pointer */
    int max_gray; /* Maximum gray level */
    int x, y; /* Loop variable */

    /* Input file open */
    printf("Monochromatic image file input routine \n");
    printf("     Only pgm binary file is acceptable\n\n");
    printf("Name of input image file? (*.pgm) : ");
    scanf("%s", file_name);
    fp = fopen(file_name, "rb");
    if (NULL == fp) {
        printf("     The file doesn't exist!\n\n");
    /* Check of file-type ---P5 */
    fgets(buffer, MAX_BUFFERSIZE, fp);
    if (buffer[0] != 'P' || buffer[1] != '5') {
        printf("     Mistaken file format, not P5!\n\n");
    /* input of x_size1, y_size1 */
    x_size1 = 0;
    y_size1 = 0;
    while (x_size1 == 0 || y_size1 == 0) {
        fgets(buffer, MAX_BUFFERSIZE, fp);
        if (buffer[0] != '#') {
            sscanf(buffer, "%d %d", &x_size1, &y_size1);
    /* input of max_gray */
    max_gray = 0;
    while (max_gray == 0) {
        fgets(buffer, MAX_BUFFERSIZE, fp);
        if (buffer[0] != '#') {
            sscanf(buffer, "%d", &max_gray);
    /* Display of parameters */
    printf("\n     Image width = %d, Image height = %d\n", x_size1, y_size1);
    printf("     Maximum gray level = %d\n\n", max_gray);
    if (x_size1 > MAX_IMAGEWIDTH || y_size1 > MAX_IMAGEHEIGHT) {
        printf("     Image size exceeds %d x %d\n\n",
        printf("     Please use smaller images!\n\n");
    if (max_gray != MAX_BRIGHTNESS) {
        printf("     Invalid value of maximum gray level!\n\n");
    /* Input of image data*/
    for (y = 0; y < y_size1; y++) {
        for (x = 0; x < x_size1; x++) {
            image1[y][x] = (unsigned char)fgetc(fp);
    printf("-----Image data input OK-----\n\n");

void save_image_data()
/* Output of image2[ ][ ], x_size2, y_size2 in pgm format*/

    char file_name[MAX_FILENAME];
    FILE *fp; /* File pointer */
    int x, y; /* Loop variable */

    /* Output file open */
    printf("Monochromatic image file output routine\n");
    printf("Name of output image file? (*.pgm) : ");
    scanf("%s", file_name);
    fp = fopen(file_name, "wb");
    /* output of pgm file header information */
    fputs("P5\n", fp);
    fputs("# Created by Image Processing\n", fp);
    fprintf(fp, "%d %d\n", x_size2, y_size2);
    fprintf(fp, "%d\n", MAX_BRIGHTNESS);
    /* Output of image data */
    for (y = 0; y < y_size2; y++) {
        for (x = 0; x < x_size2; x++) {
            fputc(image2[y][x], fp);
    printf("\n-----Image data output OK-----\n\n");

void load_image_file(char *filename)
/* Input of header & body information of pgm file */
/* for image1[ ][ ],x_size1,y_size1 */
    char buffer[MAX_BUFFERSIZE];
    FILE *fp; /* File pointer */
    int max_gray; /* Maximum gray level */
    int x, y; /* Loop variable */

    /* Input file open */
    fp = fopen(filename, "rb");
    if (NULL == fp) {
        printf("     The file doesn't exist!\n\n");
    /* Check of file-type ---P5 */
    fgets(buffer, MAX_BUFFERSIZE, fp);
    if (buffer[0] != 'P' || buffer[1] != '5') {
        printf("     Mistaken file format, not P5!\n\n");
    /* input of x_size1, y_size1 */
    x_size1 = 0;
    y_size1 = 0;
    while (x_size1 == 0 || y_size1 == 0) {
        fgets(buffer, MAX_BUFFERSIZE, fp);
        if (buffer[0] != '#') {
            sscanf(buffer, "%d %d", &x_size1, &y_size1);
    /* input of max_gray */
    max_gray = 0;
    while (max_gray == 0) {
        fgets(buffer, MAX_BUFFERSIZE, fp);
        if (buffer[0] != '#') {
            sscanf(buffer, "%d", &max_gray);
    if (x_size1 > MAX_IMAGEWIDTH || y_size1 > MAX_IMAGEHEIGHT) {
        printf("     Image size exceeds %d x %d\n\n",
        printf("     Please use smaller images!\n\n");
    if (max_gray != MAX_BRIGHTNESS) {
        printf("     Invalid value of maximum gray level!\n\n");
    /* Input of image data*/
    for (y = 0; y < y_size1; y++) {
        for (x = 0; x < x_size1; x++) {
            image1[y][x] = (float)fgetc(fp);

void save_image_file(char *filename)
/* Output of image2[ ][ ], x_size2, y_size2 */
/* into pgm file with header & body information */
    FILE *fp; /* File pointer */
    int x, y; /* Loop variable */

    fp = fopen(filename, "wb");
    /* output of pgm file header information */
    fputs("P5\n", fp);
    fputs("# Created by Image Processing\n", fp);
    fprintf(fp, "%d %d\n", x_size2, y_size2);
    fprintf(fp, "%d\n", MAX_BRIGHTNESS);
    /* Output of image data */
    for (y = 0; y < y_size2; y++) {
        for (x = 0; x < x_size2; x++) {
            fputc(image2[y][x], fp);

#define MAX_IMAGEWIDTH  3840
#define MAX_IMAGEHEIGHT  2160


cudaMalloc((void **)&deviceInputImageData, x_size1 * y_size1 * sizeof(float));
cudaMalloc((void **)&deviceOutputImageData, x_size1 * y_size1 * sizeof(float));


然后当您从主机到设备进行复制时(在设备上发生类似的问题 - >主机副本):

cudaMemcpy(deviceInputImageData, image1, x_size1 * y_size1 * sizeof(float), cudaMemcpyHostToDevice);







#include <stdio.h>
#include <stdlib.h>
#include <float.h>
#include <time.h>

#define MAX_IMAGEWIDTH  2048
#define MAX_IMAGEHEIGHT 2048
#define MAX_BRIGHTNESS  255 /* Maximum gray level */
#define GRAYLEVEL       256 /* No. of gray levels */
#define MAX_FILENAME    256 /* Filename length limit */
#define MAX_BUFFERSIZE  256

/* Global constant declaration */
/* Image storage arrays */

float *image1, *image2;
int x_size1, y_size1, /* width & height of image1*/
x_size2, y_size2; /* width & height of image2 */

/* Prototype declaration of functions */
void load_image_data( ); /* image input */
void save_image_data( ); /* image output*/
void load_image_file(char *); /* image input */
void save_image_file(char *); /* image output*/

/* Main body of functions */

void load_image_data()
/* Input of header & body information of pgm file */
/* for image1[ ][ ],x_size1,y_size1 */
    char file_name[MAX_FILENAME];
    char buffer[MAX_BUFFERSIZE];
    FILE *fp; /* File pointer */
    int max_gray; /* Maximum gray level */
    int x, y; /* Loop variable */

    /* Input file open */
    printf("Monochromatic image file input routine \n");
    printf("     Only pgm binary file is acceptable\n\n");
    printf("Name of input image file? (*.pgm) : ");
    scanf("%s", file_name);
    fp = fopen(file_name, "rb");
    if (NULL == fp) {
        printf("     The file doesn't exist!\n\n");
    /* Check of file-type ---P5 */
    fgets(buffer, MAX_BUFFERSIZE, fp);
    if (buffer[0] != 'P' || buffer[1] != '5') {
        printf("     Mistaken file format, not P5!\n\n");
    /* input of x_size1, y_size1 */
    x_size1 = 0;
    y_size1 = 0;
    while (x_size1 == 0 || y_size1 == 0) {
        fgets(buffer, MAX_BUFFERSIZE, fp);
        if (buffer[0] != '#') {
            sscanf(buffer, "%d %d", &x_size1, &y_size1);
    /* input of max_gray */
    max_gray = 0;
    while (max_gray == 0) {
        fgets(buffer, MAX_BUFFERSIZE, fp);
        if (buffer[0] != '#') {
            sscanf(buffer, "%d", &max_gray);
    /* Display of parameters */
    printf("\n     Image width = %d, Image height = %d\n", x_size1, y_size1);
    printf("     Maximum gray level = %d\n\n", max_gray);
    if (x_size1 > MAX_IMAGEWIDTH || y_size1 > MAX_IMAGEHEIGHT) {
        printf("     Image size exceeds %d x %d\n\n",
        printf("     Please use smaller images!\n\n");
    if (max_gray != MAX_BRIGHTNESS) {
        printf("     Invalid value of maximum gray level!\n\n");
    image1 = (float *)malloc(x_size1*y_size1*sizeof(float));
    /* Input of image data*/
    for (y = 0; y < y_size1; y++) {
        for (x = 0; x < x_size1; x++) {
            image1[y*x_size1+x] = (unsigned char)fgetc(fp);
    printf("-----Image data input OK-----\n\n");

void save_image_data()
/* Output of image2[ ][ ], x_size2, y_size2 in pgm format*/

    char file_name[MAX_FILENAME];
    FILE *fp; /* File pointer */
    int x, y; /* Loop variable */

    /* Output file open */
    printf("Monochromatic image file output routine\n");
    printf("Name of output image file? (*.pgm) : ");
    scanf("%s", file_name);
    fp = fopen(file_name, "wb");
    /* output of pgm file header information */
    fputs("P5\n", fp);
    fputs("# Created by Image Processing\n", fp);
    fprintf(fp, "%d %d\n", x_size2, y_size2);
    fprintf(fp, "%d\n", MAX_BRIGHTNESS);
    /* Output of image data */
    for (y = 0; y < y_size2; y++) {
        for (x = 0; x < x_size2; x++) {
            fputc(image2[y*x_size2+x], fp);
    printf("\n-----Image data output OK-----\n\n");

#define Mask_width  3
#define Mask_radius Mask_width/2
#define TILE_WIDTH 16
#define w (TILE_WIDTH + Mask_width - 1)
#define clamp(x) (min(max((x), 0.0), 1.0))

__global__ void convolution(float *I, const float* __restrict__ M, float *P, int width, int height) {
    __shared__ float N_ds[w][w];

    // First batch loading
    int dest = threadIdx.y * TILE_WIDTH + threadIdx.x,
        destY = dest / w, destX = dest % w,
        srcY = blockIdx.y * TILE_WIDTH + destY - Mask_radius,
        srcX = blockIdx.x * TILE_WIDTH + destX - Mask_radius,
        src = srcY * width + srcX;
    if (srcY >= 0 && srcY < height && srcX >= 0 && srcX < width)
        N_ds[destY][destX] = I[src];
        N_ds[destY][destX] = 0;
    for (int iter = 1; iter <= (w*w) / (TILE_WIDTH*TILE_WIDTH); iter++)
        // Second batch loading
        dest = threadIdx.y * TILE_WIDTH + threadIdx.x + TILE_WIDTH * TILE_WIDTH;
        destY = dest / w, destX = dest % w;
        srcY = blockIdx.y * TILE_WIDTH + destY - Mask_radius;
        srcX = blockIdx.x * TILE_WIDTH + destX - Mask_radius;
        src = srcY * width + srcX;
        if (destY < w) {
            if (srcY >= 0 && srcY < height && srcX >= 0 && srcX < width)
                N_ds[destY][destX] = I[src];
                N_ds[destY][destX] = 0;

    float accum = 0;
    int y, x;
    for (y = 0; y < Mask_width; y++)
        for (x = 0; x < Mask_width; x++)
            accum += N_ds[threadIdx.y + y][threadIdx.x + x] * M[y * Mask_width + x];

    y = blockIdx.y * TILE_WIDTH + threadIdx.y;
    x = blockIdx.x * TILE_WIDTH + threadIdx.x;
    if (y < height && x < width)
        P[y * width + x] = accum;


void sobel_filtering()
/* Spatial filtering of image data */
/* Sobel filter (horizontal differentiation */
/* Input: image1[y][x] ---- Outout: image2[y][x] */

    /* Definition of Sobel filter in horizontal direction */
    float weight[3][3] = { { -1,  0,  1 },
              { -2,  0,  2 },
              { -1,  0,  1 } };

    int x, y;  /* Loop variable */
    float * deviceInputImageData;
    float * deviceOutputImageData;
    float * deviceMaskData;

    cudaMalloc((void **)&deviceInputImageData, x_size1 * y_size1 * sizeof(float));
    cudaMalloc((void **)&deviceOutputImageData, x_size1 * y_size1 * sizeof(float));
    cudaMalloc((void **)&deviceMaskData, 3 * 3 * sizeof(float));

    cudaMemcpy(deviceInputImageData, image1, x_size1 * y_size1 * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(deviceMaskData, weight, 3 * 3 * sizeof(float), cudaMemcpyHostToDevice);

    /* Maximum values calculation after filtering*/
    printf("Now, filtering of input image is performed\n\n");
    x_size2 = x_size1;
    y_size2 = y_size1;
    image2 = (float *)malloc(x_size2*y_size2*sizeof(float));
    for (y = 0; y < y_size2; y++) {
        for (x = 0; x < x_size2; x++) {
            image2[y*x_size2+x] = 0;

    dim3 dimGrid(ceil((float)x_size1 / TILE_WIDTH), ceil((float)y_size1 / TILE_WIDTH));
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
    convolution<<<dimGrid, dimBlock>>>(deviceInputImageData, deviceMaskData, deviceOutputImageData, x_size1, y_size1);

        x_size2 * y_size2 * sizeof(float),



int main()
    load_image_data();   /* Input of image1 */

    clock_t begin = clock();
    sobel_filtering();   /* Sobel filter is applied to image1 */
    clock_t end = clock();
    double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
    printf("\n\nTiming result of multiplication of matrix-vector: %f\n", time_spent);
    save_image_data();   /* Output of image2 */
    return 0;

注意上面的代码(你的PGM加载例程)有一个(IMO)缺陷,它需要在PGM文件的同一行指定x和y大小,但据我所知,这不是一个P5 PGM文件的要求。如果您传递一个有效的P5 PGM文件,该文件在文件的不同行上指定了x和y大小,它将挂起。我没有尝试解决这个问题,因为它似乎不是你要问的问题。