Question

我有以下代码行：

// Driver from the question: hands `host` and `tran` to transpose() and then
// prints tran[0][0].
load()
{
    float* host;
    // init host done

    // NOTE: tran is only declared here; nothing in this function assigns it
    // a value before it is passed (by value) to transpose() and then
    // dereferenced in the printf below.
    float** tran;

    printf("testing...\n");

    transpose(host, tran, 600);

    printf("testing. 3..\n");

    printf(" value =%d \n", tran[0][0]); // segmentation here

    printf("done...\n");
}

// Host wrapper from the question: allocates two device buffers, copies
// `input` to the device, launches kernel_transpose, copies the device output
// buffer into `output`, and frees the device buffers.
// None of the CUDA calls have their return status checked.
void transpose(float *input, float** output, int width)
{
    // Total byte count for 128*width floats.
    int size = 128*width*sizeof(float);

    // Allocate space on the GPU for input and output
    float* GPU_input = 0;   
    float** GPU_output;

    cudaMalloc(&GPU_input, size);
    // Same byte count as the input buffer, although the pointer type is
    // float**; the allocated memory is not initialized here.
    cudaMalloc(&GPU_output, size);

    // Copy the input data to the GPU (host to device)
    cudaMemcpy(GPU_input, input, size, cudaMemcpyHostToDevice);

    // 32x32 threads per block; grid of (width/32+1) x (128/32) blocks.
    dim3 threads(32, 32);
    dim3 grid(width/32+1, 128/32);

    printf("OK...\n");

    kernel_transpose<<< grid, threads >>>(GPU_input, GPU_output);

    printf("OK 2...\n");

    // Copy the output buffer back from the GPU (device to host)
    cudaMemcpy(output, GPU_output, size, cudaMemcpyDeviceToHost);

    printf("OK 3...\n");

    cudaFree(GPU_input);
    cudaFree(GPU_output);

    printf("testing. 2..\n");
}

// Kernel from the question. Each thread derives a global (x, y) index from
// its block and thread coordinates, reads array[index_x+index_y], and stores
// it through output[index_x][index_y].
// There is no bounds check on either index, and output[index_x] is
// dereferenced as a row pointer.
__global__ void kernel_transpose(float *array, float** output)
{
    int index_x = blockIdx.x*blockDim.x + threadIdx.x;
    int index_y = blockIdx.y*blockDim.y + threadIdx.y;

    output[index_x][index_y] = array[index_x+index_y];
}

编译并运行我得到了：

testing...
OK...
OK 2...
OK 3...
testing. 2..
testing. 3..
line 84: 26819 Segmentation fault

为什么会出现“段错误”（Segmentation fault）？如果可以的话，该如何修复？

Answer 1

在使用 host 和 tran 之前，需要先为它们分配内存并进行初始化。例如，在 C 中可以使用 malloc 和 free，并且务必检查返回值，以便处理分配失败的情况：

// for strerror
#include <string.h>
...

// Returns the size in bytes of a buffer holding 128*width floats.
// The parameter needs an explicit type: an untyped parameter list is not
// valid in C++/CUDA.
int iobufsize(int width) {
    return 128*width*sizeof(float);
}

/* Skeleton for allocating a matrix as an array of row pointers backed by a
   single data block. The sizes are left as `...` placeholders.
   Returns the array of row pointers, or NULL if either malloc fails; when
   the data-block allocation fails, the pointer array is freed and errno is
   preserved.
 */
float** alloc_matrix(...) {
    float **matrix, *pool;
    int i, eno=0;
    // allocate array of array pointers
    if ((matrix = (float**)malloc( ... ))) {
        /* Allocate array to hold data. This array will be divided up to
           create the arrays used in the matrix. Alternatively, each matrix array
           could be allocated separately.
         */
        if ((pool = (float*) malloc( ... ))) {
            for (i=0; i < ... ; i++) {
                // each item in matrix points to a subarray of the data array
                matrix[i] = pool + i * ...;
            }
        } else {
            /* free() shouldn't change errno, but this isn't guaranteed 
               for every implementation, so save errno to be safe.
             */
            eno = errno;
            free(matrix);
            errno = eno;
            matrix = NULL;
        }
    }
    return matrix;
}
/* Releases a matrix built by alloc_matrix. A NULL matrix is ignored. */
void free_matrix(float **matrix, ...) {
    if (matrix) {
        // matrix[0] is the start of the single data block (pool + 0), so
        // freeing *matrix releases the storage behind every row.
        free(*matrix);
        // then release the array of row pointers itself
        free(matrix);
    }
}

/* Allocates the host input buffer and the output matrix, runs transpose(),
   prints the first element of the result, and releases both buffers.
   If either allocation fails, a message with the errno text is written to
   stderr and anything already allocated is freed.
 */
void load() {
    float* host;
    float** tran;
    int width = 600;
    int size = iobufsize(width);

    if ((host = (float*)malloc(size))) {
        if ((tran = alloc_matrix( ... ))) {
            // initialize 'host' buffer somehow
            ...

            printf("testing\n");

            transpose(host, tran, width);

            printf("testing 3\n");
            // tran[0][0] is a float (promoted to double when passed to
            // printf), so it must be printed with %f; %d would be
            // undefined behavior.
            printf(" value =%f \n", tran[0][0]);
            printf("done\n");

            free_matrix(tran, ...);
            free(host);
        } else {
            /* Note: strerror isn't thread safe. If load() is run concurrently,
               make use of strerror_r instead.
             */
            fprintf(stderr, "Couldn't allocate output buffer: %s.\n", strerror(errno));
            free(host);
        }
    } else {
        fprintf(stderr, "Couldn't allocate input buffer: %s.\n", strerror(errno));
    }
}

作为改进，您可以将矩阵（指向数组的指针数组）与矩阵的尺寸一起封装到一个结构体中。这是迈向创建矩阵类的一步。

在 C++ 中，您可以使用 new[] 和 delete[]，并为 host 和 tran 创建类，让它们成为另一个负责管理交互的类的成员。这样可以简化内存管理（尤其是在 new[] 抛出 bad_alloc 时），因为可以应用 RAII 模式。

您的代码还存在其他一些问题。当您分配内存（例如使用 cudaMalloc）或复制内存（例如使用 cudaMemcpy）时，内容不会被赋予任何语义（例如指针语义）。这对您的代码有多方面的影响。首先，每次分配只能得到一个一维数组。如果需要更高维的数组，必须先分配一个指针数组，再分配各个低维数组，然后让指针数组中的每个指针指向对应的低维数组（如 alloc_matrix 中所示）。例如，cudaMalloc(&GPU_output, size); 创建的是一个由 float* 组成的一维数组，但并没有初始化其内容。因此，您得到的并不是一个有效的“指向数组的指针数组”。

此外，GPU_output 的大小也是错误的：size 是全部数据的总大小，而不是高维（指针）数组的大小。这个错误源于把一维数组和二维数组当作同一种东西来处理，而它们并不相同。一维数组是一块连续的存储区域，其内容是一系列布局相同的对象（即数组元素是同构的）。二维数组则是一个指针数组，其中每个指针指向一个数组。在这两种情况下，代码都通过指向数组第一个元素的指针来访问数组（例如，host 是指向第一个 float 数据的指针；tran 是指向第一个指针的指针，而该指针又指向第一个数组的第一个 float 数据）。alloc_matrix 构造数组的方式使您可以方便地将数据视为一维或二维，因为各个数组本身是连续的（它们取自 pool 所指向的连续内存区域）。host[x][y] 将数据视为二维；(*host)[x*n+y]（或 host[0][x*n+y]），其中 n 是单个数据数组的大小，则将数据视为一维。

您实际得到的是：

float **GPU_output --> | float* | -> random memory location
                       | float* | -> random memory location
                          ....       (size / sizeof(float**) pointers total)
                       | float* | -> random memory location

您想要的是：

float **GPU_output --> | float* | --> | float |
                                      | float |
                                         ...     
                                      | float |
                       | float* | --> | float |
                                      | float |
                                         ...     
                                      | float |  
                                  ...            
                       | float* | --> | float |
                                      | float |
                                         ...     
                                      | float |

从数组C ++中读取元素的分段错误

1 个答案: