我是 C++ 新手,但已经设法为 TensorFlow 编写了一个全新的 CPU op。现在我想为 GPU 编写一个 op。我有一点 OpenCL 方面的经验。我参考的是这里的指南:
https://www.tensorflow.org/versions/r0.11/how_tos/adding_an_op/index.html#gpu-support
下面是我的 C++ 代码,后面是 CUDA 文件。这段代码并没有做任何实际工作。它可以正确编译,但每次尝试运行时都会发生核心转储(core dump)。为了便于调试,我已经删除了类中的所有内容,以便专注于这个问题。运行时输出的错误信息大致如下:
*** Error in `/usr/bin/python': free(): invalid next size (fast): 0x00007fef04033ba0 ***
这是d_grid_gpu.cc文件:
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
// Registers the interface of the "DGridGpu" op: one int32 input named `grid`,
// seven attributes with default values (six int, one float), and one int32
// output named `prev`.
REGISTER_OP("DGridGpu")
.Input("grid: int32")
.Attr("start_x: int = 0")
.Attr("start_y: int = 0")
.Attr("stop_x: int = 28")
.Attr("stop_y: int = 28")
.Attr("size_x: int = 28")
.Attr("size_y: int = 28")
.Attr("wall_height: float = 2.5")
.Output("prev: int32");
using namespace tensorflow;
// Forward declaration only; the definition lives in d_grid_gpu.cu.cc, which is
// compiled separately with nvcc and linked into the same shared library.
void run();
// Kernel class for the "DGridGpu" op. In this stripped-down form Compute()
// only forwards to the free function run(); it neither reads the op's input
// nor allocates anything.
// NOTE(review): the op registration declares an int32 output, but no output
// is allocated here -- confirm that this is intended for the debugging build.
class DGridGpuOp : public OpKernel {
 public:
  // All construction work is delegated to the OpKernel base class.
  explicit DGridGpuOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}

  // Called when the op executes; `ctx` is not used.
  void Compute(OpKernelContext* ctx) override { run(); }
};

// Registers DGridGpuOp as the kernel for "DGridGpu" on DEVICE_GPU.
REGISTER_KERNEL_BUILDER(Name("DGridGpu").Device(DEVICE_GPU), DGridGpuOp);
这是d_grid_gpu.cu.cc文件:
#if GOOGLE_CUDA
#define EIGEN_USE_GPU
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
// content here
#include <stdio.h>
#define SIZE 1024
// Element-wise integer addition: c[i] = a[i] + b[i] for every i in [0, n).
//
// Each thread processes the one element selected by its global index
// (blockIdx.x * blockDim.x + threadIdx.x). For a single-block launch this is
// the same as threadIdx.x, so existing <<<1, N>>> launches behave exactly as
// before, while launches with several blocks now cover the whole range instead
// of having every block recompute the first blockDim.x elements.
// Threads whose index is >= n do nothing.
//
// a, b : input arrays, each with at least n readable ints
// c    : output array with at least n writable ints
// n    : number of elements to add
__global__ void VectorAdd(int *a, int *b, int *c, int n)
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    c[i] = a[i] + b[i];
}
void run() {
int *a, *b, *c;
int *d_a, *d_b, *d_c;
a = (int *)malloc(SIZE*sizeof(int));
b = (int *)malloc(SIZE*sizeof(int));
c = (int *)malloc(SIZE*sizeof(int));
cudaMalloc( &d_a, SIZE*sizeof(int));
cudaMalloc( &d_b, SIZE*sizeof(int));
cudaMalloc( &d_c, SIZE*sizeof(int));
for( int i = 0; i < SIZE; ++i )
{
a[i] = i;
b[i] = i;
c[i] = 0;
}
cudaMemcpy( d_a, a, SIZE*sizeof(int), cudaMemcpyHostToDevice );
cudaMemcpy( d_b, b, SIZE*sizeof(int), cudaMemcpyHostToDevice );
cudaMemcpy( d_c, c, SIZE*sizeof(int), cudaMemcpyHostToDevice );
// blocks, threads
VectorAdd<<< 1, SIZE >>>(d_a, d_b, d_c, SIZE);
cudaMemcpy( c, d_c, SIZE*sizeof(int), cudaMemcpyDeviceToHost );
for( int i = 0; i < 10; ++i)
printf("output : c[%d] = %d\n", i, c[i]);
free(a);
free(b);
free(c);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
}
#endif
这是我用来构建op的代码:
# Ask the installed TensorFlow Python package for its header directory.
TF_INC=$(python -c 'import tensorflow as tf; print(tf.sysconfig.get_include())')
# Step 1: compile the CUDA source to an object file with nvcc
# (GOOGLE_CUDA=1 enables the code inside the file's "#if GOOGLE_CUDA" guard).
nvcc -std=c++11 -c -o d_grid_gpu.cu.o d_grid_gpu.cu.cc \
-I $TF_INC -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC --expt-relaxed-constexpr
# Step 2: compile the C++ op source and link it together with the CUDA object
# and libcudart into the shared library d_grid_gpu.so.
g++ -std=c++11 -shared -o d_grid_gpu.so d_grid_gpu.cc \
d_grid_gpu.cu.o -I $TF_INC -fPIC -lcudart -D_GLIBCXX_USE_CXX11_ABI=0 -L /usr/lib/x86_64-linux-gnu/
这就是全部代码。正如我所说,CUDA 代码并不做任何实际工作,但整个 op 可以通过编译。我还有调用这个库的 Python 代码,这里没有贴出。我相信我的 CUDA 环境工作正常。我使用的是 Ubuntu 16.10 和 CUDA 8。
编辑 - 转储前的一些错误:
*** Error in `/usr/bin/python': free(): invalid next size (fast): 0x00007f34f4033ba0 ***
======= Backtrace: =========
/lib/x86_64-linux-gnu/libc.so.6(+0x790cb)[0x7f35664f20cb]
/lib/x86_64-linux-gnu/libc.so.6(+0x8275a)[0x7f35664fb75a]
/lib/x86_64-linux-gnu/libc.so.6(cfree+0x4c)[0x7f35664ff18c]
/usr/local/lib/python2.7/dist-packages/tensorflow/python/_pywrap_tensorflow.so(+0x22223a1)[0x7f354d7953a1]
/usr/local/lib/python2.7/dist-packages/tensorflow/python/_pywrap_tensorflow.so(+0x222b6a2)[0x7f354d79e6a2]
/usr/local/lib/python2.7/dist-packages/tensorflow/python/_pywrap_tensorflow.so(+0x221fd90)[0x7f354d792d90]
/usr/local/lib/python2.7/dist-packages/tensorflow/python/_pywrap_tensorflow.so(_ZN5Eigen26NonBlockingThreadPoolTemplIN10tensorflow6thread16EigenEnvironmentEE10WorkerLoopEi+0x3c8)[0x7f354d9f4ce8]
/usr/local/lib/python2.7/dist-packages/tensorflow/python/_pywrap_tensorflow.so(_ZNSt17_Function_handlerIFvvEZN10tensorflow6thread16EigenEnvironment12CreateThreadESt8functionIS0_EEUlvE_E9_M_invokeERKSt9_Any_data+0x22)[0x7f354d9f44b2]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xbb8f0)[0x7f354b0408f0]
/lib/x86_64-linux-gnu/libpthread.so.0(+0x770a)[0x7f356684770a]
/lib/x86_64-linux-gnu/libc.so.6(clone+0x5f)[0x7f35665810af]
======= Memory map: ========
200000000-200100000 rw-s 3cf997000 00:06 570 /dev/nvidiactl
... more memory map here...
我希望这会有所帮助。我尝试了这一点,我认为它有效,但我无法重现结果。
编辑:我稍微更改了我的代码但仍然获得内存转储。
d_grid_gpu.cc
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
// Registers the interface of the "DGridGpu" op: one int32 input named `grid`
// and one int32 output named `prev`.
REGISTER_OP("DGridGpu")
.Input("grid: int32")
.Output("prev: int32");
using namespace tensorflow;
// Forward declaration only; the definition lives in d_grid_gpu.cu.cc.
// `in` is read by the CUDA kernel, `out` is read and written by it.
void run(const int * in, int * out);
class DGridGpuOp : public OpKernel {
public:
explicit DGridGpuOp(OpKernelConstruction* context) : OpKernel(context) {
}
void Compute(OpKernelContext* context) override {
Tensor* prev_h = NULL;
const Tensor& grid_h = context->input(0);
auto grid = grid_h.flat<int32>();
OP_REQUIRES_OK(context, context->allocate_output(
0,
TensorShape({64}), &prev_h));
auto prev = prev_h->flat<int32>();
run(grid.data(), prev.data()); // do something to grid_host and move it to prev_host
//exit
}
};
REGISTER_KERNEL_BUILDER(Name("DGridGpu").Device(DEVICE_GPU), DGridGpuOp);
//REGISTER_KERNEL_BUILDER(Name("DGridGpu").Device(DEVICE_CPU), DGridGpuOp);
d_grid_gpu.cu.cc
#if GOOGLE_CUDA
#define EIGEN_USE_GPU
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include <stdio.h>
#define SIZE 20
// Accumulates `in` into `out` element-wise: out[i] = in[i] + out[i] for every
// i in [0, n).
//
// Each thread processes the one element selected by its global index
// (blockIdx.x * blockDim.x + threadIdx.x). For a single-block launch this is
// the same as threadIdx.x, so existing <<<1, N>>> launches behave exactly as
// before, while launches with several blocks now cover the whole range.
// Threads whose index is >= n do nothing.
//
// NOTE(review): out[i] is read before it is written, so `out` must already
// hold defined values when the kernel is launched -- verify against the caller.
//
// in  : input array with at least n readable ints
// out : array with at least n ints, read and then overwritten
// n   : number of elements to process
__global__ void VectorAdd( const int *in, int *out, int n)
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    out[i] = in[i] + out[i];
}
void run(const int * in, int * out) {
VectorAdd<<< 1, SIZE >>>( in, out, SIZE);
}
#endif
答案 0 :(得分:0)
如果按如下方式修改 d_grid_gpu.cc,我就可以让 'run()' 方法运行而不发生内存转储。最关键的改动是 'REGISTER_KERNEL_BUILDER' 那一行:现在它指定的是 'DEVICE_CPU' 而不是 'DEVICE_GPU'。不过我觉得这并不是 TensorFlow 开发者推荐的做法。
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
// Registers the interface of the "DGridGpu" op: one int32 input named `grid`,
// seven attributes with default values (six int, one float), and one int32
// output named `prev`.
REGISTER_OP("DGridGpu")
.Input("grid: int32")
.Attr("start_x: int = 0")
.Attr("start_y: int = 0")
.Attr("stop_x: int = 28")
.Attr("stop_y: int = 28")
.Attr("size_x: int = 28")
.Attr("size_y: int = 28")
.Attr("wall_height: float = 2.5")
.Output("prev: int32");
using namespace tensorflow;
// Forward declaration only; presumably resolved by the run() defined in
// d_grid_gpu.cu.cc -- the definition is not part of this snippet.
void run();
class DGridGpuOp : public OpKernel {
public:
explicit DGridGpuOp(OpKernelConstruction* context) : OpKernel(context) {
}
void Compute(OpKernelContext* context) override {
Tensor grid;
Tensor * prev;
grid = context->input(0);
auto grid_host = grid.template flat<int32>();
OP_REQUIRES_OK(context, context->allocate_output(
0,
TensorShape({64}), &prev));
auto prev_host = prev->flat<int32>();
run(); // do something to grid_host and move it to grid_prev
//exit
}
};
REGISTER_KERNEL_BUILDER(Name("DGridGpu").Device(DEVICE_CPU), DGridGpuOp);
答案 1 :(得分:0)
简而言之,更大的问题是你试图自己管理内存,而 TensorFlow 已经知道如何替你完成这件事。你应该使用 TensorFlow 的机制来管理内存;你不需要任何 malloc、free、cudaMalloc、cudaFree、cudaMemcpy 代码。
我建议先从修改教程中的 GPU 内核示例入手:
https://github.com/tensorflow/tensorflow/blob/r0.11/tensorflow/g3doc/how_tos/adding_an_op/cuda_op_kernel.cc https://github.com/tensorflow/tensorflow/blob/r0.11/tensorflow/g3doc/how_tos/adding_an_op/cuda_op_kernel.cu.cc
内核接收已在GPU内存中分配的缓冲区作为输入。您只需将其地址传递给GPU内核即可。
要为输出分配缓冲区,您应该使用 OpKernelContext::allocate_output() 分配 Tensor,并将其地址传递给 GPU 内核。另外还有 allocate_temp(),用于分配临时缓冲区。上面的例子就是以这种方式分配其输出的。默认情况下,在 GPU 上这会在 GPU 内存中分配缓冲区。因此,无需自己分配内存,也无需在设备和主机之间手动复制数据。
您目前是先在主机上填充作为内核输入的缓冲区,然后手动将其复制到 GPU。最简单的做法可能是直接在 GPU 上填充缓冲区,或者使用单独的 TensorFlow CPU 运算符来创建输入;TensorFlow 会在必要时负责主机 -> 设备的复制。
我希望这有帮助!