用 CUDA C++ 对向量值求和

时间:2018-12-31 10:29:58

标签: cuda

我尝试使用 CUDA C++ 对许多向量的值求和。我找到了针对两个向量的一些解决方案。如您所见,可以将两个向量相加,但我想动态生成许多具有相同长度的向量并将它们全部求和。

# Authorizes a participant joining their own "participant:<id>" topic.
# Only the socket whose assigned participant_id matches the topic's id may
# join; on success, defer post-join work via an :after_participant_join message.
def join("participant:" <> participant_id, _payload, socket) do
  case socket.assigns.participant_id do
    ^participant_id ->
      send(self(), :after_participant_join)
      {:ok, socket}

    _ ->
      {:error, %{reason: "unauthorized"}}
  end
end

# Post-join hook scheduled from join/3: registers this socket with Presence
# under the experiment's id, then keeps the channel alive.
def handle_info(:after_participant_join, socket) do
  %{experiment_id: experiment_id} = socket.assigns

  Presence.track(socket, experiment_id, %{
    # keys to track
  })

  # Broadcast something
  # broadcast(socket, ...)

  {:noreply, socket}
end

# Route outgoing "presence_diff" broadcasts through handle_out/3 below
# instead of pushing them to the client directly.
intercept(["presence_diff"])

# Runs for each intercepted "presence_diff" broadcast before it reaches the
# client. NOTE(review): reported to fire on Presence.track but not when the
# connection is closed — terminate/2 below is used as a workaround.
def handle_out("presence_diff", payload, socket) do
  IO.puts("presence_diff triggered, payload is #{inspect(payload)}")

  leaves = payload.leaves

  for {experiment_id, meta} <- leaves do
    # `meta` is a map; maps do not implement String.Chars, so interpolating
    # it directly (`#{meta}`) raises Protocol.UndefinedError — use inspect/1.
    IO.puts("Leave information: #{inspect(meta)}")

    # Do stuffs
  end

  # handle_out/3 must return a channel reply tuple; returning the `for`
  # comprehension's result (a list) would crash the channel process.
  {:noreply, socket}
end

# Invoked when the channel process shuts down, regardless of how the
# connection ended — this fires reliably, unlike the presence_diff intercept.
def terminate(reason, socket) do
  formatted = inspect(reason)
  IO.puts("terminated. #{formatted}")

  # Do stuffs.
end

是否有一种方法可以对许多矢量进行此操作?我的向量大小是:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// CUDA kernel: element-wise vector addition, c[id] = a[id] + b[id].
// Launch with a 1D grid covering at least n threads; each thread handles
// exactly one element. Marking the inputs `const ... __restrict__` promises
// no aliasing with `c`, letting the compiler use the read-only data cache.
__global__ void vecAdd(const double* __restrict__ a,
                       const double* __restrict__ b,
                       double* __restrict__ c, int n)
{
    // Flat global thread index
    int id = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the partially-filled last block: the grid rarely divides n evenly
    if (id < n)
        c[id] = a[id] + b[id];
}

// Abort with a diagnostic if a CUDA runtime call failed. Kernel launches do
// not return an error directly; check cudaGetLastError() after the launch.
static void checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Adds two n-element vectors on the GPU and verifies that the mean of the
// result is ~1 (since sin^2(i) + cos^2(i) == 1 for every i).
int main( int argc, char* argv[] )
{
// Size of vectors
int n = 100000;

// Size, in bytes, of each vector
size_t bytes = n*sizeof(double);

// Allocate memory for each vector on host (a, b inputs; c output)
double *h_a = (double*)malloc(bytes);
double *h_b = (double*)malloc(bytes);
double *h_c = (double*)malloc(bytes);
if (h_a == NULL || h_b == NULL || h_c == NULL) {
    fprintf(stderr, "host allocation failed\n");
    return EXIT_FAILURE;
}

// Allocate memory for each vector on GPU
double *d_a, *d_b, *d_c;
checkCuda(cudaMalloc(&d_a, bytes), "cudaMalloc d_a");
checkCuda(cudaMalloc(&d_b, bytes), "cudaMalloc d_b");
checkCuda(cudaMalloc(&d_c, bytes), "cudaMalloc d_c");

int i;
// Initialize vectors on host: a[i] + b[i] == 1 by the Pythagorean identity
for( i = 0; i < n; i++ ) {
    h_a[i] = sin(i)*sin(i);
    h_b[i] = cos(i)*cos(i);
}

// Copy host vectors to device
checkCuda(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice), "memcpy h_a->d_a");
checkCuda(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice), "memcpy h_b->d_b");

// Number of threads in each thread block (max supported on current GPUs)
int blockSize = 1024;

// Number of thread blocks in grid: ceil(n / blockSize)
int gridSize = (n + blockSize - 1) / blockSize;

// Execute the kernel; the launch itself reports config errors via
// cudaGetLastError(), not a return value.
vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
checkCuda(cudaGetLastError(), "vecAdd launch");

// Copy array back to host (cudaMemcpy synchronizes with the kernel)
checkCuda(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost), "memcpy d_c->h_c");

// Sum up vector c and print the result divided by n;
// this should equal 1 within error
double sum = 0;
for(i=0; i<n; i++)
    sum += h_c[i];
printf("final result: %f\n", sum/n);

// Release device memory
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);

// Release host memory
free(h_a);
free(h_b);
free(h_c);

return 0;
}

结果我需要获得:

#vector length
N = 1000 
#number of vectors
i = 300000
v[i] = [1,2,..., N]

谢谢您的建议。

1 个答案:

答案 0 :(得分:1)

以类似于您所显示的代码的方式将多个向量加在一起(即生成元素和),等效于对矩阵的列求和。这个想法代表了实现解决方案的明智方法。

我们会将您的向量集合视为一个矩阵,其中每个向量都是矩阵中的一行。CUDA 内核将为每一列分配一个线程,并对该列的所有元素求和,从而产生单个数值结果。该单个结果将成为整个问题的结果向量中的一个元素。

这是一个完整的示例,展示了一种可能的方法:

$ cat t2.cu
#include <iostream>

typedef double mt;
const int nTPB = 64;

// Sums each column of a row-major (n_vectors x vector_length) matrix into
// `sums` (one entry per column). One thread per column: thread `col`
// accumulates matrix[row][col] over every row. Because the layout is
// row-major, consecutive threads read consecutive addresses on each
// iteration, so the global loads are coalesced.
template <typename T>
__global__ void column_sum(T *matrix, T *sums, unsigned n_vectors, unsigned vector_length){

  unsigned col = threadIdx.x + blockDim.x * blockIdx.x;
  if (col >= vector_length)
    return;  // grid may overshoot the number of columns

  T acc = 0;
  for (unsigned row = 0; row < n_vectors; row++)
    acc += matrix[row * vector_length + col];
  sums[col] = acc;
}

// Builds nvec identical vectors [0, 1, ..., vlen-1] as one row-major matrix,
// sums the columns on the GPU, and verifies sums[i] == nvec * i.
int main(){
  const unsigned vlen = 1000;    // length of each vector (matrix columns)
  const unsigned nvec = 300000;  // number of vectors (matrix rows)
  mt *h_matrix, *d_matrix, *h_sums, *d_sums;
  // create the desired number of vectors as a single matrix
  h_sums = new mt[vlen];
  h_matrix = new mt[vlen*nvec];
  // check allocations: the matrix is ~2.4 GB of device memory for double
  if (cudaMalloc(&d_matrix, vlen*nvec*sizeof(mt)) != cudaSuccess ||
      cudaMalloc(&d_sums, vlen*sizeof(mt)) != cudaSuccess) {
    std::cout << "device allocation failed: "
              << cudaGetErrorString(cudaGetLastError()) << std::endl;
    return -1;}
  // each row is the same vector: element j holds the value j
  size_t count = 0;
  for (unsigned i = 0; i < nvec; i++)
    for (unsigned j = 0; j < vlen; j++)
      h_matrix[count++] = j;
  cudaMemcpy(d_matrix, h_matrix, vlen*nvec*sizeof(mt), cudaMemcpyHostToDevice);
  // one thread per column, nTPB threads per block, ceil-div for the grid
  column_sum<<<(vlen+nTPB-1)/nTPB,nTPB>>>(d_matrix, d_sums, nvec, vlen);
  cudaMemcpy(h_sums, d_sums, vlen*sizeof(mt), cudaMemcpyDeviceToHost);
  // column i sums the value i over nvec rows, so it must equal nvec*i
  int rc = 0;
  for (unsigned i = 0; i < vlen; i++)
    if (h_sums[i] != ((mt)nvec)*i) {
      std::cout << " mismatch at " << i << " was: " << h_sums[i]
                << " should be: " << ((mt)nvec)*i << std::endl;
      rc = -1;
      break;}
  if (rc == 0)
    std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
  // release device and host memory (the original leaked all four buffers)
  cudaFree(d_matrix);
  cudaFree(d_sums);
  delete [] h_matrix;
  delete [] h_sums;
  return rc;
}
$ nvcc -o t2 t2.cu
$ cuda-memcheck ./t2
========= CUDA-MEMCHECK
no error
========= ERROR SUMMARY: 0 errors
$

请注意,此方法在 GPU 上创建的线程数仅与向量元素个数(上例中为 1000)一样多。1000 个线程大约只够让最小的 GPU 保持繁忙。但是,如果向量长度为 10,000 或更长,此算法在大多数 GPU 上仍然有效。如果您想探索为小规模问题创建更高效的算法,可以研究经典并行归约(classical parallel reduction)的思想。