I am trying to use the cublas library in a custom TensorFlow op, but calling cublasSasum from inside the TensorFlow op kernel causes a segmentation fault.
First, I successfully compiled and tested the cuda_op_kernel.cpp example at head (built from cuda_op_kernel.cpp and cuda_op_kernel.cu.cc). It works fine and can be imported and used from Python via tf.load_op_library().
Second, I implemented the op's functionality in CUDA, compiled it with nvcc, and ran it. This code includes the two operations cublasSasum and cublasSgemm; both compile fine and produce the desired results.
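Condensed, the standalone version looks roughly like this (a sketch of my test program; the array size and the all-ones input are placeholders):

#include <cstdio>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main()
{
    /* placeholder input: N ones on the host */
    const int N = 25;
    float h_in[N];
    float h_sum = 0.0f;
    for (int i = 0; i < N; ++i) h_in[i] = 1.0f;

    /* copy the input to the device */
    float* d_in = nullptr;
    cudaMalloc(&d_in, N * sizeof(float));
    cudaMemcpy(d_in, h_in, N * sizeof(float), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);

    /* sum of absolute values; the result is written to host memory */
    cublasSasum(handle, N, d_in, 1, &h_sum);
    printf("sum = %f\n", h_sum); /* prints 25.0 here */

    cublasDestroy(handle);
    cudaFree(d_in);
    return 0;
}

Compiled with nvcc and linked against -lcublas, this prints the expected sum.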
However, when I try to substitute this kernel into the cuda_op_kernel.cpp example, the op appears to compile correctly, but then, when it is imported and used from Python, it produces a segmentation fault at runtime. No information is given beyond:
Segmentation fault (core dumped)
I am not sure whether there is a compatibility issue between cublas and the TensorFlow C++ library, because if I remove the cublasSasum call everything works and cublasSgemm produces the desired output.
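For reference, the variant that does work replaces the cublasSasum call with a cublasSgemm call roughly along these lines (a sketch; the launcher name, the square 5x5 dimensions, and the n*n-sized output allocation are assumptions for illustration):

#include <cuda_runtime.h>
#include <cublas_v2.h>

/* Sketch of the working variant: computes out = in * in for an n-by-n input. */
void MyGemmLauncher(const float* d_in, const int n, float* d_out)
{
    cublasHandle_t handle;
    cublasCreate(&handle);

    /* alpha and beta are read from host memory (the default pointer mode) */
    const float alpha = 1.0f;
    const float beta = 0.0f;
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                n, n, n,
                &alpha, d_in, n, d_in, n,
                &beta, d_out, n);

    cublasDestroy(handle);
}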
During compilation with nvcc there is a warning related to the Eigen library, but it also appears without cublasSasum, in which case everything works fine, so I don't think it is the cause.
nvcc output:
/home/user/.conda/envs/tensorflow/lib/python3.6/site-packages/tensorflow/include/unsupported/Eigen/CXX11/../src/SpecialFunctions/SpecialFunctionsImpl.h(651): warning: missing return statement at end of non-void function "Eigen::internal::igammac_cf_impl<Scalar, mode>::run [with Scalar=float, mode=Eigen::internal::VALUE]"
          detected during:
            instantiation of "Scalar Eigen::internal::igammac_cf_impl<Scalar, mode>::run(Scalar, Scalar) [with Scalar=float, mode=Eigen::internal::VALUE]"
            (855): here
            instantiation of "Scalar Eigen::internal::igamma_generic_impl<Scalar, mode>::run(Scalar, Scalar) [with Scalar=float, mode=Eigen::internal::VALUE]"
            (2096): here
            instantiation of "Eigen::internal::igamma_retval<Eigen::internal::global_math_functions_filtering_base<Scalar, void>::type>::type Eigen::numext::igamma(const Scalar &, const Scalar &) [with Scalar=float]"
            /home/fg299/.conda/envs/tensorflow/lib/python3.6/site-packages/tensorflow/include/unsupported/Eigen/CXX11/../src/SpecialFunctions/SpecialFunctionsHalf.h(34): here
The example that produces the error:
stack.cpp
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
using namespace tensorflow; // NOLINT(build/namespaces)
REGISTER_OP("MyKernel")
.Input("input: float")
.Output("output: float");
void MyKernelLauncher(const float* in, const int N, float* out);
class MyKernelOp : public OpKernel
{
public:
explicit MyKernelOp(OpKernelConstruction* context) : OpKernel(context) {}
void Compute(OpKernelContext* context) override
{
// Grab the input tensor and get shapes
const Tensor& input_tensor = context->input(0);
const TensorShape& input_shape = input_tensor.shape();
int N = input_shape.dim_size(0)*input_shape.dim_size(1);
std::cout<<N<<std::endl;
// get intput as eigen tensor.
auto input = input_tensor.flat<float>();
//Create an output tensor
Tensor* output_tensor = nullptr;
// create output shape
TensorShape output_shape;
output_shape.AddDim(1);
// create output tensor
OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output_tensor));
auto output = output_tensor->flat<float>();
// call the kernel
MyKernelLauncher(input.data(), N, output.data());
}
};
REGISTER_KERNEL_BUILDER(Name("MyKernel").Device(DEVICE_GPU), MyKernelOp);
stack.cu
#include <cstdio>
#include <cuda_runtime.h>
#include <cublas_v2.h>

#if GOOGLE_CUDA
#define EIGEN_USE_GPU
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

/* main function */
void MyKernelLauncher(const float* d_w1, const int N, float* h_ans)
{
    /* Initialize CUBLAS */
    cublasStatus_t status;
    cublasHandle_t handle;
    status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! CUBLAS initialization error\n");
        cudaDeviceReset();
    }

    printf("STARTING SUM\n");

    /* Perform cublasSasum: sum of absolute values of d_w1, written to h_ans */
    status = cublasSasum(handle, N, d_w1, 1, h_ans);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        printf("cublasSasum failed\n");
        cudaDeviceReset();
    }
    printf("DONE SUM\n");

    /* Destroy the CUBLAS handle */
    cublasDestroy(handle);
}
#endif  // GOOGLE_CUDA
test.py
import tensorflow as tf
import numpy as np

my_module = tf.load_op_library("./stack.so")
b = np.ones((5, 5))

with tf.Session() as sess:
    ans = my_module.my_kernel(b)
    print(ans.eval())
As I mentioned, this works fine if I use cublas functions other than cublasSasum.