在eclipse(ubuntu)中使用JNI for CUDA从.o文件生成.so文件时出错

时间:2015-09-16 12:47:03

标签: cuda java-native-interface g++ nvcc

我正在开发一个基于JNI的CUDA程序。我有一个包含main函数的Java类jclass.java(其中声明了本地方法jniEntry()),以及由javah生成的jclass.h。JNI桥接文件cEntry.c包含该本地方法的实现,其声明为 JNIEXPORT void JNICALL Java_jclass_jniEntry(JNIEnv * env,jobject thisObj)。上述函数调用CUDA主机函数,即cudaprogram.h中声明的jniEntry();然后jniEntry()函数再调用cudaprogram.cu中包含的设备函数。

我似乎无法从已生成的.o文件生成.so文件,即来自cudaprogram.cu的cudaprogram.o和来自cEntry.c的cEntry.o(cEntry.c是JNI的桥梁,即jclass.java -> jclass.class(由javac生成)和jclass.h(由javah -jni生成))。我的makefile是:

# Builds libcujni1.so from the JNI bridge (cEntry.c) and the CUDA source (cudaprog.cu).
# Header search paths: CUDA SDK, CUDA toolkit, JDK (generic and linux) and the current directory.
INCLUDES    := -I$(CUDASDK_PATH)/inc -I$(CUDA_PATH)/include  -I$(JDK_PATH)/include -I$(JDK_PATH)/include/linux -I.
    # Libraries and library search paths used by the final link step.
    LIBRARIES   := -lrt -lm  -lcudart -lcufft -lcublas -L$(CUDA_PATH)/lib64  -L.
    # Location of jclass.java / jclass.class relative to this makefile.
    JAVASRC_PATH := ../ 
    NATIVESRC_PATH := ./ 
    # nvcc driver, using g++ as the host compiler.
    NVCC := /opt/cuda-6.5//bin/nvcc -ccbin g++ 


    # Links both object files into the shared library loaded by System.loadLibrary("cujni1").
    # Note that $(LIBRARIES) appears before the object files on this command line.
    cujni1: cEntry.o cudaprog.o makefile jclass.h cudaprog.h
        g++  $(INCLUDES) $(LIBRARIES) -v -m64 -shared -fPIC -o libcujni1.so cEntry.o cudaprog.o 

    # Compiles the JNI bridge as position-independent code.
    # NOTE(review): cudaprog.o is listed as a prerequisite although the compile command does not use it.
    cEntry.o: cEntry.c jclass.h cudaprog.o 
        gcc $(INCLUDES) -v  -m64 -fPIC -o $@  cEntry.c -c

    # Compiles the CUDA source with nvcc.
    # NOTE(review): unlike the gcc/g++ rules, this command passes no position-independent-code option.
    cudaprog.o: cudaprog.cu jclass.h  cudaprog.h
        $(NVCC) $(INCLUDES) -v -m64  -o $@ -c cudaprog.cu   

    # NOTE(review): the `build` target and $(EXEC) are not defined in the part of the makefile shown here.
    run: build
        $(EXEC) ./cujni1

    # Generates the JNI header from the compiled class.
    jclass.h:jclass.class
        javah -jni -classpath $(JAVASRC_PATH) jclass

    # Compiles the Java class; the rule has no prerequisites, so make only runs it when jclass.class is missing.
    jclass.class: 
        javac $(JAVASRC_PATH)/jclass.java

生成没有错误的文件是jclass.class,jclass.h,cudaprogram.o,cEntry.o但是libcujni1.so没有生成,因为我得到的错误类似于

/usr/bin/ld: cudaprog.o: relocation R_X86_64_32 against `.rodata' can not be used when making a shared object; recompile with -fPIC
cudaprog.o: error adding symbols: Bad value
collect2: error: ld returned 1 exit status
make: *** [cujni1] Error 1

如你所见,我使用nvcc编译.cu文件,所以不能直接使用-fPIC选项,因为nvcc会报错"未知选项 -fPIC"。

作为参考,我也附上其他源文件。jclass.java:

/**
 * Minimal JNI demo: loads the native library "cujni1" and calls the
 * native method jniEntry() once from main().
 */
public class jclass {

    // Load the native library once, when the class is initialised.
    static {
        System.loadLibrary("cujni1");
    }

    /** Implemented in native code (no Java body). */
    private native void jniEntry();

    /** Prints a greeting, then hands control to the native side. */
    public static void main(String[] args) {
        System.out.print("1:Hello" + "JNI CUder\n");

        final jclass bridge = new jclass();
        bridge.jniEntry();
    }
}

cEntry.c:

#include <stdio.h>

#include <jni.h>
#include "jclass.h"
#include "cudaprog.h"

/*
 * JNI bridge for the Java native method jclass.jniEntry().
 * Prints a trace line and forwards to the host function jniEntry()
 * declared in cudaprog.h. Neither JNI argument is used.
 */
JNIEXPORT void JNICALL Java_jclass_jniEntry(JNIEnv* env, jobject thisObj)
{
    /* Required by the JNI signature but not needed here. */
    (void)env;
    (void)thisObj;

    printf("2:cEntry.c-->Java_jclass_jniEntry!\n");
    jniEntry();
}

cudaprog.h(原文此处标注为"生成的jclass.h",但以下内容的头文件保护宏是CUDAPROG_H_,实际上是cudaprog.h):

#ifndef CUDAPROG_H_
#define CUDAPROG_H_

/*
 * Declares the host entry point with C linkage so that it can be called
 * both from C translation units and from C++/CUDA ones.
 */
#ifdef __cplusplus
    extern "C" {
#endif
    /* Explicit (void): in C an empty parameter list is not a prototype. */
    void jniEntry(void);
#ifdef __cplusplus
    }
#endif

#endif /* CUDAPROG_H_ */

cudaprogram.cu:

// includes, system
#include <string.h>
#include <math.h>

#include "jclass.h"
#include "cudaprog.h"

#include <stdio.h>
#include <iostream>
#include <stdlib.h>     /* srand, rand */
#include <time.h>       /* time */
#include <ctime>
// CUDA runtime
#include <cuda_runtime.h>

// Helper functions and utilities to work with CUDA
#include <helper_functions.h>
#ifdef __cplusplus
extern "C"
{
#endif

#define LO -100.0f
#define HI 100.0f
#define BlockSize 16
#define VECTORLENGTH 100
#define MATRIXLENGTH 4000


/*
 * Kernel: each thread derives an (x, y) index pair from its block and
 * thread indices. If both lie below MATRIXLENGTH it computes the Euclidean
 * distance between rows `row` and `col` of In (addressed as a
 * MATRIXLENGTH x VECTORLENGTH row-major array) and stores it at
 * Out[row][col] (addressed as MATRIXLENGTH x MATRIXLENGTH, row-major).
 */
__global__ void
calDistanceMatrixCUDA(float *Out, float *In)
{
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    const int col = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads outside the output matrix have nothing to do.
    if (row >= MATRIXLENGTH || col >= MATRIXLENGTH)
    {
        return;
    }

    // Start of the two input vectors being compared.
    float *vecA = In + row * VECTORLENGTH;
    float *vecB = In + col * VECTORLENGTH;

    float sumSquares = 0.0f;
    for (int k = 0; k < VECTORLENGTH; k++)
    {
        float delta = vecA[k] - vecB[k];
        delta *= delta;
        sumSquares += delta;
    }

    Out[row * MATRIXLENGTH + col] = sqrt(sumSquares);
}

#ifdef __cplusplus
}
#endif

#ifdef __cplusplus
extern "C"
{
#endif

/*
 * Host entry point called from the JNI bridge.
 *
 * Fills a MATRIXLENGTH x VECTORLENGTH host array with random floats between
 * LO and HI, copies it to the device, launches calDistanceMatrixCUDA over a
 * 2-D grid that covers the whole MATRIXLENGTH x MATRIXLENGTH output, times
 * the launch with CUDA events, copies the result back to the host and
 * releases all host and device resources.
 *
 * Any allocation or CUDA failure prints a message and ends the process
 * through exit(EXIT_FAILURE).
 */
void jniEntry()
{

    /* Timing placeholders; only time1 and time2 are assigned so far. */
    clock_t time1, time2, time3, time4;
    double tDiff1, tDiff2, tDiff3, tDiff4;

    unsigned int numElements = VECTORLENGTH;//dims
    unsigned int numVectors = MATRIXLENGTH;

    dim3 dimsVector(VECTORLENGTH, 1, 1);
    dim3 dimsVectorArray(MATRIXLENGTH, VECTORLENGTH, 1);
    dim3 dimsDistMatrix(MATRIXLENGTH, MATRIXLENGTH, 1);

    size_t sizeVector = VECTORLENGTH * sizeof(float);
    size_t sizeVectorArray = sizeVector * MATRIXLENGTH;
    size_t sizeMatrix = MATRIXLENGTH * MATRIXLENGTH * sizeof(float);

    unsigned int nSizeVector = dimsVector.x * dimsVector.y;
    unsigned int mem_SizeVector = sizeof(float) * nSizeVector;
    unsigned int nSizeVectorArray = dimsVectorArray.x * dimsVectorArray.y;
    unsigned int mem_SizeVectorArray = sizeof(float) * nSizeVectorArray;
    unsigned int nSizeDistMatrix = dimsDistMatrix.x * dimsDistMatrix.y;
    unsigned int mem_SizeDistMatrix = sizeof(float) * nSizeDistMatrix;

    /* Host buffer that receives the distance matrix. */
    float *distMatrix = (float *)malloc(mem_SizeDistMatrix);
    if (distMatrix == NULL)
    {
        fprintf(stderr, "Failed to allocate host distMatrix!\n");
        exit(EXIT_FAILURE);
    }

    /* Initialise the input vectors with random values. */
    time1 = clock();

    float *featureV100 = (float *)malloc(mem_SizeVectorArray);
    if (featureV100 == NULL)
    {
        fprintf(stderr, "Failed to allocate host featureV100!\n");
        exit(EXIT_FAILURE);
    }

    /* Unsigned index: the bound is an unsigned element count. */
    for (unsigned int i = 0; i < nSizeVectorArray; ++i)
    {
        featureV100[i] = LO + static_cast <float> (rand()) / (static_cast <float> (RAND_MAX / (HI - LO)));
    }
    time2 = clock();

    /* Device buffers. */
    float *d_featureV100, *d_DistMatrix;

    cudaError_t error;
    error = cudaMalloc((void **)&d_featureV100, mem_SizeVectorArray);
    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_featureV100 returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMalloc((void **)&d_DistMatrix, mem_SizeDistMatrix);
    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_DistMatrix returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMemcpy(d_featureV100, featureV100, mem_SizeVectorArray, cudaMemcpyHostToDevice);
    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (d_featureV100,featureV100) returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    /* CUDA events used to time the kernel. */
    cudaEvent_t start;
    error = cudaEventCreate(&start);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    cudaEvent_t stop;
    error = cudaEventCreate(&stop);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Record the start event
    error = cudaEventRecord(start, NULL);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    /*
     * Execution configuration. The kernel derives i from the x dimension
     * and j from the y dimension, so both block and grid must be 2-D;
     * with a 1-D launch j is always 0 and only one output column is
     * written. The grid size is rounded up so every (i, j) is covered;
     * the kernel itself discards out-of-range threads.
     */
    const unsigned int blocksPerSide = (MATRIXLENGTH + BlockSize - 1) / BlockSize;
    dim3 threads(BlockSize, BlockSize);
    dim3 grid(blocksPerSide, blocksPerSide);

    calDistanceMatrixCUDA<<<grid, threads>>>(d_DistMatrix, d_featureV100);

    /* Check for launch errors right away, before any other API call. */
    error = cudaGetLastError();

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch calDistanceMatrixCUDA (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Record the stop event
    error = cudaEventRecord(stop, NULL);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Wait for the stop event to complete
    error = cudaEventSynchronize(stop);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    float msecTotal = 0.0f;
    error = cudaEventElapsedTime(&msecTotal, start, stop);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Compute and print the performance
    float msec = msecTotal ;
    printf(
        "Performance= Time= %.3f msec, WorkgroupSize= %d,%d,%d threads/block & %d,%d,%d blocks/grid\n",
        msec,
        threads.x,threads.y,threads.z,
        grid.x,grid.y,grid.z);

    error = cudaMemcpy(distMatrix, d_DistMatrix, mem_SizeDistMatrix, cudaMemcpyDeviceToHost);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy d_DistMatrix from device to host distMatrix (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    /* Release the timing events as well as the buffers. */
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    cudaFree(d_featureV100);
    cudaFree(d_DistMatrix);
    free(featureV100);
    free(distMatrix);

    error = cudaDeviceReset();

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    printf("Done\n");

}

#ifdef __cplusplus
}
#endif

毋庸置疑,上面的cudaprogram.cu作为独立的CUDA应用程序(即不经过JNI)运行时没有任何错误。请指导我在makefile中使用正确的选项,因为我是编写makefile的新手。谢谢。

修改

在您在答案中提到的更改之后。 ldd命令给出了

ldd libcujni1.so 
    linux-vdso.so.1 =>  (0x00007ffd919b6000)
    libcudart.so.6.5 => /opt/cuda-6.5//lib64/libcudart.so.6.5 (0x00007f47bde41000)
    libstdc++.so.6 => /usr/lib/x86_64-linux-gnu/libstdc++.so.6 (0x00007f47bdb3d000)
    libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f47bd778000)
    libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007f47bd574000)
    libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007f47bd356000)
    librt.so.1 => /lib/x86_64-linux-gnu/librt.so.1 (0x00007f47bd14e000)
    libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007f47bce48000)
    /lib64/ld-linux-x86-64.so.2 (0x00007f47be297000)
    libgcc_s.so.1 => /lib/x86_64-linux-gnu/libgcc_s.so.1 (0x00007f47bcc32000)

,make命令行显示

    make all 
javac ..//jclass.java
javah -jni -classpath ../ jclass
/opt/cuda-6.5//bin/nvcc -ccbin g++ -I/home/faizan/workspace/common//inc -I/opt/cuda-6.5//include  -I/usr/lib/jvm/jdk1.8.0_60//include -I/usr/lib/jvm/jdk1.8.0_60//include/linux -I. -Xcompiler -fPIC -m64 -o cudaprog.o -c *.cu #  -v 
cudaprog.cu(89): warning: variable "time1" was set but never used

cudaprog.cu(89): warning: variable "time2" was set but never used

cudaprog.cu(89): warning: variable "time3" was declared but never referenced

cudaprog.cu(89): warning: variable "time4" was declared but never referenced

cudaprog.cu(90): warning: variable "tDiff1" was declared but never referenced

cudaprog.cu(90): warning: variable "tDiff2" was declared but never referenced

cudaprog.cu(90): warning: variable "tDiff3" was declared but never referenced

cudaprog.cu(90): warning: variable "tDiff4" was declared but never referenced

cudaprog.cu(92): warning: variable "numElements" was declared but never referenced

cudaprog.cu(93): warning: variable "numVectors" was declared but never referenced

cudaprog.cu(100): warning: variable "sizeVectorArray" was declared but never referenced

cudaprog.cu(101): warning: variable "sizeMatrix" was declared but never referenced

cudaprog.cu(104): warning: variable "mem_SizeVector" was declared but never referenced

cudaprog.cu(89): warning: variable "time1" was set but never used

cudaprog.cu(89): warning: variable "time2" was set but never used

cudaprog.cu(89): warning: variable "time3" was declared but never referenced

cudaprog.cu(89): warning: variable "time4" was declared but never referenced

cudaprog.cu(90): warning: variable "tDiff1" was declared but never referenced

cudaprog.cu(90): warning: variable "tDiff2" was declared but never referenced

cudaprog.cu(90): warning: variable "tDiff3" was declared but never referenced

cudaprog.cu(90): warning: variable "tDiff4" was declared but never referenced

cudaprog.cu(92): warning: variable "numElements" was declared but never referenced

cudaprog.cu(93): warning: variable "numVectors" was declared but never referenced

cudaprog.cu(100): warning: variable "sizeVectorArray" was declared but never referenced

cudaprog.cu(101): warning: variable "sizeMatrix" was declared but never referenced

cudaprog.cu(104): warning: variable "mem_SizeVector" was declared but never referenced

g++  -I/home/faizan/workspace/common//inc -I/opt/cuda-6.5//include  -I/usr/lib/jvm/jdk1.8.0_60//include -I/usr/lib/jvm/jdk1.8.0_60//include/linux -I. -shared  -fPIC -m64 -o cEntry.o  cEntry.c  jclass.h  cudaprog.h #  -shared  -fPIC -Xlinker -znoexecstack -Xlinker -shared  -v -g 
g++  -I/home/faizan/workspace/common//inc -I/opt/cuda-6.5//include  -I/usr/lib/jvm/jdk1.8.0_60//include -I/usr/lib/jvm/jdk1.8.0_60//include/linux -I. -m64 -shared -fPIC -o libcujni1.so cEntry.o cudaprog.o -L/opt/cuda-6.5//lib64 -Wl,-rpath=/opt/cuda-6.5//lib64 -lcufft -lcublas -lcudart -lcuda -lrt -lm #  -v

还添加了库文件夹,即

LIBRARIES   := -L$(CUDA_PATH)/lib64 -Wl,-rpath=$(CUDA_PATH)/lib64  -L$(CUDA_PATH)/lib64/stubs -Wl,-rpath=$(CUDA_PATH)/lib64/stubs -lcufft -lcublas -lcudart -lcuda -lrt -lm

当前错误(在jclass.java的main()中执行完输出语句之后):
    Exception in thread "main" 1:HelloJNI CUder
java.lang.UnsatisfiedLinkError: jclass.jniEntry()V
    at jclass.jniEntry(Native Method)
    at jclass.main(jclass.java:22)

1 个答案:

答案 0 :(得分:1)

把这些内容作为正式答案发布,因为评论区并不适合写这些……

您的第一个问题是:nvcc的编译选项列表中缺少 -Xcompiler -fPIC。

您的第二个问题是您的动态库既不与libcudart也不与libcuda链接。这可能是Makefile的问题,也可能是lib链接的顺序。

我会尝试像-L$(CUDA_PATH)/lib64 -Wl,-rpath=$(CUDA_PATH)/lib64 -lcufft -lcublas -lcudart -lcuda -lrt -lm这样的链接选项......

然后用ldd libcujni1.so检查libcudart和libcuda确实列在那里。

请在您的初始问题中发布实际链接命令行的副本(在您键入make时执行的命令行)和ldd libcujni1.so的结果。

编辑:我想我已经知道了...只需重新阅读你的Makefile,你就应该改变它:

    cujni1: cEntry.o cudaprog.o makefile jclass.h cudaprog.h
        g++  $(INCLUDES) $(LIBRARIES) -v -m64 -shared -fPIC -o libcujni1.so cEntry.o cudaprog.o

进入这个:

    cujni1: cEntry.o cudaprog.o makefile jclass.h cudaprog.h
        g++  $(INCLUDES) -v -m64 -shared -fPIC -o libcujni1.so cEntry.o cudaprog.o $(LIBRARIES)

注意$(LIBRARIES)位置的变化……在链接时,参数的顺序非常重要。