Question

我在cuda中有以下内核：

// Kernel (per-element variant): each thread copies the p_t1 field of the one
// element selected by its global thread index from `ingoing` to `outgoing`.
//   ingoing  - source Node array
//   outgoing - destination Node array
//   N        - exclusive upper bound on the index that is processed
__global__ void pagerank(Node *ingoing, Node *outgoing, int N) {
   // Only referenced by the commented-out loop below.
   int j;
   // Flat global thread index for a 1D launch.
   int idx = threadIdx.x + blockIdx.x * blockDim.x; 
    // The idx > 0 test excludes thread 0, so element 0 is never written here.
    if ((idx > 0) && (idx < N)){
      //for(j=0;j<N;j++){
      //   outgoing[j].p_t1=ingoing[j].p_t1;  
      //}
      outgoing[idx].p_t1=ingoing[idx].p_t1; 

  }
}

这个内核不起作用。而下面这个版本可以正常工作：

// Kernel (whole-array variant): every thread whose global index satisfies
// 0 < idx < N copies the p_t1 field of all N elements from `ingoing` to
// `outgoing`, so the same copy is repeated by each such thread.
//   ingoing  - source Node array
//   outgoing - destination Node array
//   N        - number of elements copied by the loop
__global__ void pagerank(Node *ingoing, Node *outgoing, int N) {
       // Loop counter over the whole array.
       int j;
       // Flat global thread index for a 1D launch.
       int idx = threadIdx.x + blockIdx.x * blockDim.x; 
        // Thread 0 is excluded; any other in-range thread performs the full copy.
        if ((idx > 0) && (idx < N)){
          for(j=0;j<N;j++){
             outgoing[j].p_t1=ingoing[j].p_t1;  
          }
          //outgoing[idx].p_t1=ingoing[idx].p_t1; 

      }
    }

有什么问题？为什么idx没有正确地索引矩阵？

完整代码如下，理解起来并不容易。问题是：当我使用下面这条语句时，在主函数末尾打印 outgoing[idx].p_t1 字段，输出的全是 0：

outgoing[idx].p_t1=ingoing[idx].p_t1;

但是当我这样做时它们是正确的

for(j=0;j<N;j++){
   outgoing[j].p_t1=ingoing[j].p_t1;  
}

怎么了？

/******************** Includes - Defines ****************/
#include "pagerank_serial.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <assert.h>
#include <string.h>
#include <sys/time.h>
#include <fcntl.h>
#include <cuda.h>
#include "string.h"

/******************** Defines ****************/
// Number of nodes (set in main from the second command-line argument)
int N;

// Convergence threshold and algorithm's parameter d  
// (both set in main from the third and fourth command-line arguments)
double threshold, d;

// Table of node's data (allocated in main with N entries)
Node *Nodes;

/*
 * Copies the p_t1 field of each of the N nodes from `ingoing` to `outgoing`.
 *
 * Launch layout: 1D grid of 1D blocks. Any grid/block size is valid, because
 * each thread starts at its flat global index and advances by the total number
 * of launched threads until it passes N, so every element in [0, N) is copied
 * exactly once.
 *
 * Parameters:
 *   ingoing  - device pointer to the source Node array (at least N entries)
 *   outgoing - device pointer to the destination Node array (at least N entries)
 *   N        - number of nodes to copy; nothing is done when N <= 0
 *
 * Uses no shared memory and performs no synchronization.
 */
__global__ void pagerank(Node *ingoing, Node *outgoing, int N) {
    // Total number of threads in the launch = distance between the elements
    // handled by one thread.
    const int stride = gridDim.x * blockDim.x;

    // Start at 0 (not 1): element 0 must be copied as well.
    for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < N; idx += stride) {
        outgoing[idx].p_t1 = ingoing[idx].p_t1;
    }
}
/***** Read graph connections from txt file *****/

/*
 * Reads "from<TAB>to" integer pairs from `filename` and records each pair in
 * the global Nodes table: the source node's con_size is incremented and the
 * destination index is stored in the next free slot of its To_id list.
 *
 * Reading stops at end of file or at the first line that does not contain two
 * integers. Pairs whose source index lies outside [0, N) are reported and
 * skipped. If the file cannot be opened, a message is printed and the
 * function returns without modifying Nodes.
 *
 * NOTE(review): the capacity of To_id is not checked here because Node's
 * definition is not visible in this file -- presumably a fixed-size array;
 * confirm it is large enough for the densest node.
 */
void Read_from_txt_file(char* filename)
{
    FILE *fid;
    int from_idx, to_idx;
    int temp_size;

    fid = fopen(filename, "r");
    if (fid == NULL){
       printf("Error opening data file\n");
       // Nothing to read; calling fscanf/fclose on NULL would be undefined.
       return;
    }

    // fscanf returns the number of converted fields; anything other than 2
    // (including EOF, which is negative and therefore "true") ends the loop.
    while (fscanf(fid,"%d\t%d\n", &from_idx,&to_idx) == 2)
    {
        if (from_idx < 0 || from_idx >= N)
        {
            printf("Skipping connection with out-of-range source node %d\n", from_idx);
            continue;
        }

        Nodes[from_idx].con_size++;
        temp_size = Nodes[from_idx].con_size;
        Nodes[from_idx].To_id[temp_size - 1] = to_idx;
    }

    fclose(fid);
}

/***** Read P vector from txt file*****/

/*
 * Reads up to N double values from "P.txt" and stores the i-th value read in
 * Nodes[i].p_t1.
 *
 * Reading stops at end of file, at the first token that is not a number, or
 * after N values, whichever comes first. If the file cannot be opened, a
 * message is printed and the function returns without modifying Nodes.
 */
void Read_P_from_txt_file()
{
    FILE *fid;
    double temp_P;
    int index = 0;

    fid = fopen("P.txt", "r");
    if (fid == NULL)
    {
        printf("Error opening the Probabilities file\n");
        return;
    }

    // P's values are double, hence %lf. The loop is bounded by N so a file
    // with too many entries cannot write past the end of Nodes.
    while (index < N && fscanf(fid,"%lf\n", &temp_P) == 1)
    {
        Nodes[index].p_t1 = temp_P;
        index++;
    }

    fclose(fid);
}


/***** Read E vector from txt file*****/

/*
 * Reads up to N double values from "E.txt" and stores the i-th value read in
 * Nodes[i].e.
 *
 * Reading stops at end of file, at the first token that is not a number, or
 * after N values, whichever comes first. If the file cannot be opened, a
 * message is printed and the function returns without modifying Nodes.
 */
void Read_E_from_txt_file()
{
    FILE *fid;
    double temp_E;
    int index = 0;

    fid = fopen("E.txt", "r");
    if (fid == NULL)
    {
        printf("Error opening the E file\n");
        return;
    }

    // E's values are double, hence %lf. The loop is bounded by N so a file
    // with too many entries cannot write past the end of Nodes.
    while (index < N && fscanf(fid,"%lf\n", &temp_E) == 1)
    {
        Nodes[index].e = temp_E;
        index++;
    }

    fclose(fid);
}

/***** Create P and E with equal probability *****/

/*
 * Initializes every entry of the global Nodes table with a uniform
 * distribution: p_t0 = 0, p_t1 = 1/N and e = 1/N, then asserts that both the
 * p_t1 values and the e values sum to 1 (within a small tolerance, since the
 * sums are accumulated in floating point).
 *
 * Does nothing when N <= 0.
 */
void Random_P_E()
{
    int i;
    // Sum of P (it must be =1)
    double sum_P_1 = 0;
    // Sum of E (it must be =1)
    double sum_E_1 = 0;
    // Allowed deviation from 1; exact equality is not expected after N
    // floating-point additions.
    const double tolerance = 1e-6;

    if (N <= 0)
    {
        return;
    }

    // Arrays initialization
    for (i = 0; i < N; i++)
    {
        Nodes[i].p_t0 = 0;
        Nodes[i].p_t1 = 1.0 / N;
        sum_P_1 = sum_P_1 + Nodes[i].p_t1;

        Nodes[i].e = 1.0 / N;
        sum_E_1 = sum_E_1 + Nodes[i].e;
    }

    // Exit if sum of P is !=1 (a comparison, not an assignment)
    assert(fabs(sum_P_1 - 1.0) < tolerance);

    // Exit if sum of E is !=1
    assert(fabs(sum_E_1 - 1.0) < tolerance);
}


/***** Main function *****/

/* Prints a diagnostic and terminates the program if a CUDA runtime call failed. */
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Program entry point.
 *
 * Arguments: <graph filename> <N> <threshold> <d>
 *
 * Loads the graph into the global Nodes table, initializes uniform
 * probabilities, copies the table to the device, runs the pagerank kernel
 * once (timed with CUDA events), copies the result back and prints the
 * elapsed time followed by the first min(N, 100) p_t1 values.
 *
 * Returns EXIT_SUCCESS on success and EXIT_FAILURE on invalid arguments or
 * host allocation failure; any CUDA failure terminates via check_cuda.
 */
int main(int argc, char** argv)
{
    int blockSize;      // The launch configurator returned block size
    int minGridSize;    // The minimum grid size needed to achieve the maximum occupancy for a full device launch
    int gridSize;       // The actual grid size needed, based on input size
    int i;
    int print_count;    // Number of result entries to print (never more than N)
    float elapsed_ms;

    // Check input arguments
    if (argc < 5)
    {
        printf("Error in arguments! Four arguments required: graph filename, N, threshold and d\n");
        return EXIT_FAILURE;
    }

    // get arguments (the filename is used directly from argv, so no fixed-size copy can overflow)
    N = atoi(argv[2]);
    threshold = atof(argv[3]);
    d = atof(argv[4]);

    if (N <= 0)
    {
        printf("Error in arguments! N must be a positive integer\n");
        return EXIT_FAILURE;
    }

    const size_t bytes = (size_t)N * sizeof(Node);

    // Allocate memory for N nodes
    Nodes = (Node*) malloc(bytes);
    if (Nodes == NULL)
    {
        printf("Error allocating memory for %d nodes\n", N);
        return EXIT_FAILURE;
    }

    for (i = 0; i < N; i++)
    {
        Nodes[i].con_size = 0;
    }

    Read_from_txt_file(argv[1]);

    // set random probabilities
    Random_P_E();

    Node *h_ingoing = Nodes;
    Node *h_outgoing = (Node *)calloc(N, sizeof *h_outgoing);
    if (h_outgoing == NULL)
    {
        printf("Error allocating memory for %d nodes\n", N);
        free(Nodes);
        return EXIT_FAILURE;
    }

    Node *d_ingoing = NULL;
    Node *d_outgoing = NULL;

    check_cuda(cudaMalloc(&d_ingoing, bytes), "cudaMalloc(d_ingoing)");
    check_cuda(cudaMalloc(&d_outgoing, bytes), "cudaMalloc(d_outgoing)");

    check_cuda(cudaMemcpy(d_ingoing, h_ingoing, bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy(d_ingoing)");
    check_cuda(cudaMemcpy(d_outgoing, h_outgoing, bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy(d_outgoing)");

    // The last argument caps the suggested block size at N.
    check_cuda(cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, pagerank, 0, N),
               "cudaOccupancyMaxPotentialBlockSize");

    // Round up according to array size
    gridSize = (N + blockSize - 1) / blockSize;
    printf("Gridsize, blockzise : %d , %d \n", gridSize, blockSize);

    cudaEvent_t begin, end;
    check_cuda(cudaEventCreate(&begin), "cudaEventCreate(begin)");
    check_cuda(cudaEventCreate(&end), "cudaEventCreate(end)");
    check_cuda(cudaEventRecord(begin, 0), "cudaEventRecord(begin)");

    // The kernel takes exactly three parameters.
    pagerank<<<gridSize, blockSize>>>(d_ingoing, d_outgoing, N);
    // A kernel launch returns nothing; configuration errors surface here.
    check_cuda(cudaGetLastError(), "pagerank kernel launch");

    check_cuda(cudaEventRecord(end, 0), "cudaEventRecord(end)");
    // Waiting on the end event also surfaces errors raised while the kernel ran.
    check_cuda(cudaEventSynchronize(end), "cudaEventSynchronize(end)");
    check_cuda(cudaEventElapsedTime(&elapsed_ms, begin, end), "cudaEventElapsedTime");

    check_cuda(cudaMemcpy(h_outgoing, d_outgoing, bytes, cudaMemcpyDeviceToHost),
               "cudaMemcpy(h_outgoing)");

    printf("%f\n", elapsed_ms) ;

    printf("\n");

    // Print final probabilities (at most 100, and never past the end of the array)
    print_count = (N < 100) ? N : 100;
    for (i = 0; i < print_count; i++)
    {
        printf("P_t1[%d] = %f\n",i,h_outgoing[i].p_t1);
    }
    printf("\n");

    // Release device and host resources
    check_cuda(cudaEventDestroy(begin), "cudaEventDestroy(begin)");
    check_cuda(cudaEventDestroy(end), "cudaEventDestroy(end)");
    check_cuda(cudaFree(d_ingoing), "cudaFree(d_ingoing)");
    check_cuda(cudaFree(d_outgoing), "cudaFree(d_outgoing)");
    free(h_outgoing);
    free(Nodes);
    Nodes = NULL;

    printf("End of program!\n");

    return (EXIT_SUCCESS);
}

Answer 1

当你说“在主函数的末尾打印出来的是 0”时，我假设您指的是所有条目，而不仅仅是索引 0。实际上，使用第一个版本时，您的代码不会处理索引 0，因为当 idx=0 时，条件 ((idx > 0) && (idx < N)) 为 false。

此外，您的代码中缺少 Node 类型的定义。必须提供它，才能更好地了解代码中可能出现的问题。

根据 Node 的大小、其内容以及您在编译时使用的结构体打包（packing）方式，主机端的 Node 大小可能与设备端的 Node 大小不同。使用 printf 验证这一点会很有帮助，或者使用调试器。

此外，您似乎没有在启动时检查错误。你肯定想在内核调用后添加cudaPeekAtLastError和cudaDeviceSynchronize以确保没有发生错误。（来自cuda Runtime API的任何其他方法调用也可能返回代码未检查的错误。）

编辑：为了尝试重现该问题，我写了以下代码，尽可能接近您的代码。我没有内存足够大的显卡，因此节点数量较少。

Node

除了在答案的第一稿中指出存在问题的索引0外，每个输出都是正确的。

Cuda idx没有正确地索引矩阵

1 个答案: