我正在开发一个基于JNI的CUDA程序。我有一个包含main函数的Java类jclass.java(其中声明了本地方法jniEntry()),以及由javah生成的jclass.h。JNI桥接文件cEntry.c包含本地函数的实现,其声明为 JNIEXPORT void JNICALL Java_jclass_jniEntry(JNIEnv * env,jobject thisObj)。该函数调用cudaprogram.h中声明的CUDA主机函数jniEntry(),然后jniEntry()再调用cudaprogram.cu中包含的设备函数。
我无法从生成的.o文件生成.so文件,即来自cudaprogram.cu的cudaprogram.o和来自cEntry.c的cEntry.o(cEntry.c是JNI的桥梁,即jclass.java -> jclass.class(来自javac)和jclass.h(来自javah -jni))。我的makefile是:
# Header search paths: CUDA SDK samples, CUDA toolkit, JDK JNI headers and the
# current directory.
INCLUDES := -I$(CUDASDK_PATH)/inc -I$(CUDA_PATH)/include -I$(JDK_PATH)/include -I$(JDK_PATH)/include/linux -I.
# Libraries and library search paths passed to the final link step.
LIBRARIES := -lrt -lm -lcudart -lcufft -lcublas -L$(CUDA_PATH)/lib64 -L.
JAVASRC_PATH := ../
# NOTE(review): NATIVESRC_PATH is not referenced by any rule shown here.
NATIVESRC_PATH := ./
NVCC := /opt/cuda-6.5//bin/nvcc -ccbin g++
# NOTE(review): the recipe lines below appear without a leading tab; this was
# presumably lost when the makefile was pasted - confirm against the real file.
#
# Links both object files into the shared library libcujni1.so.
# NOTE(review): $(LIBRARIES) is expanded before the object files here; link
# order matters, so the libraries normally belong after the objects.
cujni1: cEntry.o cudaprog.o makefile jclass.h cudaprog.h
g++ $(INCLUDES) $(LIBRARIES) -v -m64 -shared -fPIC -o libcujni1.so cEntry.o cudaprog.o
# Compiles the JNI bridge as position-independent code.
# NOTE(review): cudaprog.o is listed as a prerequisite although the compile
# command only reads cEntry.c and headers.
cEntry.o: cEntry.c jclass.h cudaprog.o
gcc $(INCLUDES) -v -m64 -fPIC -o $@ cEntry.c -c
# Compiles the CUDA source with nvcc.
# NOTE(review): no position-independent-code flag reaches the host compiler
# here; nvcc needs it forwarded as "-Xcompiler -fPIC" for the object to be
# usable in a shared library.
cudaprog.o: cudaprog.cu jclass.h cudaprog.h
$(NVCC) $(INCLUDES) -v -m64 -o $@ -c cudaprog.cu
# NOTE(review): neither a "build" target nor $(EXEC) is defined in the part of
# the makefile shown here.
run: build
$(EXEC) ./cujni1
# Generates the JNI header from the compiled class.
jclass.h:jclass.class
javah -jni -classpath $(JAVASRC_PATH) jclass
# NOTE(review): no prerequisite is listed, so make will not recompile when
# jclass.java changes as long as jclass.class exists.
jclass.class:
javac $(JAVASRC_PATH)/jclass.java
能够无错误生成的文件有jclass.class、jclass.h、cudaprogram.o和cEntry.o,但是libcujni1.so没有生成,我得到的错误类似于:
/usr/bin/ld: cudaprog.o: relocation R_X86_64_32 against `.rodata' can not be used when making a shared object; recompile with -fPIC
cudaprog.o: error adding symbols: Bad value
collect2: error: ld returned 1 exit status
make: *** [cujni1] Error 1
如你所见,我使用nvcc编译.cu文件,因此不能直接使用-fPIC选项,因为nvcc会报错“未知选项 -fPIC”。
为便于参考,我也附上其他源文件。jclass.java:
/**
 * Java-side entry point of the JNI/CUDA demo.
 *
 * Loading the class loads the native library "cujni1"; main() prints a trace
 * line and then calls the native method jniEntry() on a fresh instance.
 */
public class jclass {

    // Load the native library once, when the class is initialized.
    static {
        System.loadLibrary("cujni1");
    }

    /** Implemented natively; bound to Java_jclass_jniEntry in the JNI bridge. */
    private native void jniEntry();

    /**
     * Prints the first trace message, then hands control to native code.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        System.out.print("1:Hello" + "JNI CUder\n");
        final jclass bridge = new jclass();
        bridge.jniEntry();
    }
}
cEntry.c:
#include <jni.h>
#include "jclass.h"
#include "cudaprog.h"
/*
 * JNI implementation of the native method jclass.jniEntry().
 *
 * Prints a trace line and forwards to the host function jniEntry() declared
 * in cudaprog.h. Neither env nor thisObj is used.
 */
JNIEXPORT void JNICALL
Java_jclass_jniEntry(JNIEnv *env, jobject thisObj)
{
    printf("2:cEntry.c-->Java_jclass_jniEntry!\n");
    jniEntry();
}
cudaprog.h(原文此处标注为“生成的jclass.h”,但下面的内容实际上是cudaprog.h):
#ifndef CUDAPROG_H_
#define CUDAPROG_H_
/*
 * Public interface of the CUDA host code. The header is included from both C
 * (the JNI bridge) and C++/CUDA sources, hence the extern "C" wrapper.
 */
#ifdef __cplusplus
extern "C" {
#endif
/*
 * Host entry point called from the JNI bridge. Takes no arguments and returns
 * nothing. Declared with (void) so that C callers get a real prototype rather
 * than an old-style declaration with unspecified parameters.
 */
void jniEntry(void);
#ifdef __cplusplus
}
#endif
#endif /* CUDAPROG_H_ */
cudaprogram.cu:
// includes, system
#include <string.h>
#include <math.h>
#include "jclass.h"
#include "cudaprog.h"
#include <stdio.h>
#include <iostream>
#include <stdlib.h> /* srand, rand */
#include <time.h> /* time */
#include <ctime>
// CUDA runtime
#include <cuda_runtime.h>
// Helper functions and utilities to work with CUDA
#include <helper_functions.h>
#ifdef __cplusplus
extern "C"
{
#endif
/* Lower and upper bound of the random input values generated on the host. */
#define LO -100.0f
#define HI 100.0f
/* Presumably the edge length of a square thread block - confirm against the launch code. */
#define BlockSize 16
/* Number of components per input vector. */
#define VECTORLENGTH 100
/* Number of input vectors; the output matrix is MATRIXLENGTH x MATRIXLENGTH. */
#define MATRIXLENGTH 4000

/*
 * Computes the Euclidean distance between every pair of input vectors.
 *
 * In  : MATRIXLENGTH x VECTORLENGTH floats, row-major (one vector per row).
 * Out : MATRIXLENGTH x MATRIXLENGTH floats, row-major; element [i][j] receives
 *       the distance between vector i and vector j.
 *
 * Launch layout: the thread with global x index i and global y index j
 * produces Out[i][j], so the launch must cover MATRIXLENGTH threads in BOTH
 * x and y. Threads outside that range do nothing. With a 1D launch j is
 * always 0 and only column 0 of Out is written.
 * No shared memory is used.
 */
__global__ void
calDistanceMatrixCUDA(float *Out, float *In)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    const int j = blockIdx.y * blockDim.y + threadIdx.y;
    /* Guard the grid tail: the launch may contain more threads than elements. */
    if (i >= MATRIXLENGTH || j >= MATRIXLENGTH)
    {
        return;
    }

    /* View the flat buffers as 2D arrays; dimensions come from the macros above. */
    float(&InM)[MATRIXLENGTH][VECTORLENGTH] =
        *reinterpret_cast<float(*)[MATRIXLENGTH][VECTORLENGTH]>(In);
    float(&OutM)[MATRIXLENGTH][MATRIXLENGTH] =
        *reinterpret_cast<float(*)[MATRIXLENGTH][MATRIXLENGTH]>(Out);

    /* Accumulate the squared component differences. */
    float fDist = 0.0f;
    for (int k = 0; k < VECTORLENGTH; k++)
    {
        float fDim = InM[i][k] - InM[j][k];
        fDim *= fDim;
        fDist += fDim;
    }
    /* sqrtf keeps the computation explicitly in single precision. */
    OutM[i][j] = sqrtf(fDist);
}
#ifdef __cplusplus
}
#endif
#ifdef __cplusplus
extern "C"
{
#endif
/*
 * Host driver for calDistanceMatrixCUDA.
 *
 * Fills MATRIXLENGTH vectors of VECTORLENGTH random floats in [LO, HI],
 * copies them to the device, launches the kernel over a 2D grid that covers
 * the whole MATRIXLENGTH x MATRIXLENGTH output, times the launch with CUDA
 * events, copies the result back to the host and releases all resources.
 *
 * Takes no arguments and returns nothing. On any failure a message is printed
 * and exit(EXIT_FAILURE) is called.
 * NOTE(review): exit() terminates the whole process, which includes the JVM
 * when this function is reached through the JNI bridge.
 */
void jniEntry()
{
    const unsigned int nSizeVectorArray = MATRIXLENGTH * VECTORLENGTH;
    const size_t mem_SizeVectorArray = sizeof(float) * nSizeVectorArray;
    const size_t mem_SizeDistMatrix = sizeof(float) * MATRIXLENGTH * MATRIXLENGTH;

    /* Host buffers: destination matrix and input vectors. */
    float *distMatrix = (float *)malloc(mem_SizeDistMatrix);
    float *featureV100 = (float *)malloc(mem_SizeVectorArray);
    if (distMatrix == NULL || featureV100 == NULL)
    {
        fprintf(stderr, "Failed to allocate host buffers!\n");
        exit(EXIT_FAILURE);
    }

    /* Initialize the input vectors with random values in [LO, HI]. */
    for (unsigned int i = 0; i < nSizeVectorArray; ++i)
    {
        featureV100[i] = LO + static_cast <float> (rand()) / (static_cast <float> (RAND_MAX / (HI - LO)));
    }

    /* Device buffers. */
    float *d_featureV100, *d_DistMatrix;
    cudaError_t error;
    error = cudaMalloc((void **)&d_featureV100, mem_SizeVectorArray);
    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_featureV100 returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }
    error = cudaMalloc((void **)&d_DistMatrix, mem_SizeDistMatrix);
    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_DistMatrix returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }
    error = cudaMemcpy(d_featureV100, featureV100, mem_SizeVectorArray, cudaMemcpyHostToDevice);
    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (d_featureV100,featureV100) returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    /* Allocate CUDA events that we'll use for timing. */
    cudaEvent_t start;
    error = cudaEventCreate(&start);
    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    cudaEvent_t stop;
    error = cudaEventCreate(&stop);
    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    /* Record the start event. */
    error = cudaEventRecord(start, NULL);
    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    /*
     * Execution parameters. The kernel derives the row index from x and the
     * column index from y, so both dimensions must cover MATRIXLENGTH. A 1D
     * launch would leave every column except column 0 unwritten.
     * The block count is rounded up; the kernel bounds-checks the tail.
     */
    const unsigned int blocksPerSide = (MATRIXLENGTH + BlockSize - 1) / BlockSize;
    dim3 threads(BlockSize, BlockSize);
    dim3 grid(blocksPerSide, blocksPerSide);
    calDistanceMatrixCUDA<<<grid, threads>>>(d_DistMatrix, d_featureV100);

    /* Launch-configuration errors are only visible through cudaGetLastError(). */
    error = cudaGetLastError();
    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch calDistanceMatrixCUDA (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    /* Record the stop event. */
    error = cudaEventRecord(stop, NULL);
    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    /* Wait for the stop event to complete. */
    error = cudaEventSynchronize(stop);
    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    float msecTotal = 0.0f;
    error = cudaEventElapsedTime(&msecTotal, start, stop);
    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    /* Print the measured time and the launch configuration. */
    printf(
        "Performance= Time= %.3f msec, WorkgroupSize= %u,%u,%u threads/block & %u,%u,%u blocks/grid\n",
        msecTotal,
        threads.x, threads.y, threads.z,
        grid.x, grid.y, grid.z);

    /* Blocking copy of the result back to the host. */
    error = cudaMemcpy(distMatrix, d_DistMatrix, mem_SizeDistMatrix, cudaMemcpyDeviceToHost);
    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy d_DistMatrix from device to host distMatrix (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    /* Release events, device memory and host memory. */
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_featureV100);
    cudaFree(d_DistMatrix);
    free(featureV100);
    free(distMatrix);

    error = cudaDeviceReset();
    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    printf("Done\n");
}
#ifdef __cplusplus
}
#endif
需要说明的是,上面的cudaprogram.cu在作为独立的CUDA应用程序(即不经过JNI)运行时没有错误。请指导我在makefile中使用正确的选项,因为我是编写makefile的新手。谢谢。
编辑
按照您在答案中提到的方法修改之后,ldd命令的输出为:
ldd libcujni1.so
linux-vdso.so.1 => (0x00007ffd919b6000)
libcudart.so.6.5 => /opt/cuda-6.5//lib64/libcudart.so.6.5 (0x00007f47bde41000)
libstdc++.so.6 => /usr/lib/x86_64-linux-gnu/libstdc++.so.6 (0x00007f47bdb3d000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f47bd778000)
libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007f47bd574000)
libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007f47bd356000)
librt.so.1 => /lib/x86_64-linux-gnu/librt.so.1 (0x00007f47bd14e000)
libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007f47bce48000)
/lib64/ld-linux-x86-64.so.2 (0x00007f47be297000)
libgcc_s.so.1 => /lib/x86_64-linux-gnu/libgcc_s.so.1 (0x00007f47bcc32000)
make命令行的输出为:
make all
javac ..//jclass.java
javah -jni -classpath ../ jclass
/opt/cuda-6.5//bin/nvcc -ccbin g++ -I/home/faizan/workspace/common//inc -I/opt/cuda-6.5//include -I/usr/lib/jvm/jdk1.8.0_60//include -I/usr/lib/jvm/jdk1.8.0_60//include/linux -I. -Xcompiler -fPIC -m64 -o cudaprog.o -c *.cu # -v
cudaprog.cu(89): warning: variable "time1" was set but never used
cudaprog.cu(89): warning: variable "time2" was set but never used
cudaprog.cu(89): warning: variable "time3" was declared but never referenced
cudaprog.cu(89): warning: variable "time4" was declared but never referenced
cudaprog.cu(90): warning: variable "tDiff1" was declared but never referenced
cudaprog.cu(90): warning: variable "tDiff2" was declared but never referenced
cudaprog.cu(90): warning: variable "tDiff3" was declared but never referenced
cudaprog.cu(90): warning: variable "tDiff4" was declared but never referenced
cudaprog.cu(92): warning: variable "numElements" was declared but never referenced
cudaprog.cu(93): warning: variable "numVectors" was declared but never referenced
cudaprog.cu(100): warning: variable "sizeVectorArray" was declared but never referenced
cudaprog.cu(101): warning: variable "sizeMatrix" was declared but never referenced
cudaprog.cu(104): warning: variable "mem_SizeVector" was declared but never referenced
cudaprog.cu(89): warning: variable "time1" was set but never used
cudaprog.cu(89): warning: variable "time2" was set but never used
cudaprog.cu(89): warning: variable "time3" was declared but never referenced
cudaprog.cu(89): warning: variable "time4" was declared but never referenced
cudaprog.cu(90): warning: variable "tDiff1" was declared but never referenced
cudaprog.cu(90): warning: variable "tDiff2" was declared but never referenced
cudaprog.cu(90): warning: variable "tDiff3" was declared but never referenced
cudaprog.cu(90): warning: variable "tDiff4" was declared but never referenced
cudaprog.cu(92): warning: variable "numElements" was declared but never referenced
cudaprog.cu(93): warning: variable "numVectors" was declared but never referenced
cudaprog.cu(100): warning: variable "sizeVectorArray" was declared but never referenced
cudaprog.cu(101): warning: variable "sizeMatrix" was declared but never referenced
cudaprog.cu(104): warning: variable "mem_SizeVector" was declared but never referenced
g++ -I/home/faizan/workspace/common//inc -I/opt/cuda-6.5//include -I/usr/lib/jvm/jdk1.8.0_60//include -I/usr/lib/jvm/jdk1.8.0_60//include/linux -I. -shared -fPIC -m64 -o cEntry.o cEntry.c jclass.h cudaprog.h # -shared -fPIC -Xlinker -znoexecstack -Xlinker -shared -v -g
g++ -I/home/faizan/workspace/common//inc -I/opt/cuda-6.5//include -I/usr/lib/jvm/jdk1.8.0_60//include -I/usr/lib/jvm/jdk1.8.0_60//include/linux -I. -m64 -shared -fPIC -o libcujni1.so cEntry.o cudaprog.o -L/opt/cuda-6.5//lib64 -Wl,-rpath=/opt/cuda-6.5//lib64 -lcufft -lcublas -lcudart -lcuda -lrt -lm # -v
还添加了库文件夹,即
LIBRARIES := -L$(CUDA_PATH)/lib64 -Wl,-rpath=$(CUDA_PATH)/lib64 -L$(CUDA_PATH)/lib64/stubs -Wl,-rpath=$(CUDA_PATH)/lib64/stubs -lcufft -lcublas -lcudart -lcuda -lrt -lm
当前错误(出现在jclass.java的main()中的输出语句执行之后):
Exception in thread "main" 1:HelloJNI CUder
java.lang.UnsatisfiedLinkError: jclass.jniEntry()V
at jclass.jniEntry(Native Method)
at jclass.main(jclass.java:22)
答案 0 :(得分:1)
将其作为正式答案发布,因为评论区并不适合用来作答……
您的第一个问题是:nvcc的编译选项列表中缺少 -Xcompiler -fpic。
您的第二个问题是您的动态库既不与libcudart也不与libcuda链接。这可能是Makefile的问题,也可能是lib链接的顺序。
我会尝试像-L$(CUDA_PATH)/lib64 -Wl,-rpath=$(CUDA_PATH)/lib64 -lcufft -lcublas -lcudart -lcuda -lrt -lm
这样的链接选项......
然后用ldd libcujni1.so
检查libcudart和libcuda确实列在那里。
请在您的初始问题中贴出实际的链接命令行(即您键入make时执行的命令行)以及ldd libcujni1.so的结果。
另外,请把makefile中的这条规则:
cujni1: cEntry.o cudaprog.o makefile jclass.h cudaprog.h
g++ $(INCLUDES) $(LIBRARIES) -v -m64 -shared -fPIC -o libcujni1.so cEntry.o cudaprog.o
改成这样:
cujni1: cEntry.o cudaprog.o makefile jclass.h cudaprog.h
g++ $(INCLUDES) -v -m64 -shared -fPIC -o libcujni1.so cEntry.o cudaprog.o $(LIBRARIES)
注意$(LIBRARIES)位置的变化……在链接时,参数的顺序非常重要。