我目前正在使用GASS的CUDA.NET库。 我需要在一个CPU线程中初始化cuda数组(实际上是cublas向量,但这并不重要),并在其他CPU线程中使用它们。但是,包含所有初始化数组和加载函数的CUDA上下文只能附加到一个CPU线程。
有一种称为上下文迁移API的机制,用于从一个线程分离上下文并将其附加到另一个线程。但我不知道如何在CUDA.NET中正确使用它。
我尝试过如下做法:
// Repro from the question: a CUBLAS vector is allocated on the main thread, then the
// CUDA context is handed to a second thread which tries to use that vector.
// NOTE(review): according to the surrounding description this version fails at the
// SetVector call inside GetVectorFromDevice; the code is kept unchanged to show the attempt.
class Program
{
// Host-side buffers: vector1 is uploaded to the device, vector2 receives the download.
private static float[] vector1, vector2;
// CUDA.NET wrapper objects shared by both threads.
private static CUDA cuda;
private static CUBLAS cublas;
// Device allocation returned by cublas.Allocate in AllocateVectors.
private static CUdeviceptr ptr;
// Entry point: creates the context, allocates the device vector, releases the context
// from this thread and passes it to a worker thread.
static void Main(string[] args)
{
cuda = new CUDA(false);
cublas = new CUBLAS(cuda);
cuda.Init();
cuda.CreateContext(0);
AllocateVectors();
// NOTE(review): presumably DetachContext maps to the driver's cuCtxDetach, which may
// release the context before it is popped below - confirm against the CUDA.NET docs.
cuda.DetachContext();
CUcontext context = cuda.PopCurrentContext();
GetVectorFromDeviceAsync(context);
}
// Fills vector1 with sample data, allocates a device buffer of the same length,
// uploads vector1 into it, and creates the 5-element host buffer vector2.
private static void AllocateVectors()
{
vector1 = new float[]{1f, 2f, 3f, 4f, 5f};
ptr = cublas.Allocate(vector1.Length, sizeof (float));
cublas.SetVector(vector1, ptr);
vector2 = new float[5];
}
// Thread body: receives the context as a boxed object (Thread.Start(object) signature),
// tries to make it current on this thread, then re-uploads and downloads the vector.
private static void GetVectorFromDevice(object objContext)
{
CUcontext localContext = (CUcontext) objContext;
cuda.PushCurrentContext(localContext);
cuda.AttachContext(localContext);
//change vector somehow
vector1[0] = -1;
//copy changed vector to device
cublas.SetVector(vector1, ptr);
// Read the device buffer back into vector2.
cublas.GetVector(ptr, vector2);
// Pops the context through the raw CUDADriver binding rather than the cuda wrapper used above.
CUDADriver.cuCtxPopCurrent(ref localContext);
}
// Starts GetVectorFromDevice on a new foreground thread, passing the context as its argument.
// Despite the name, nothing is awaited here; the method returns right after Start.
private static void GetVectorFromDeviceAsync(CUcontext cUcontext)
{
Thread thread = new Thread(GetVectorFromDevice);
thread.IsBackground = false;
thread.Start(cUcontext);
}
}
但在尝试把修改后的向量复制到设备时执行失败,原因是上下文没有被附加到当前线程。不太可能是其他原因,因为同样的代码在单线程模式下运行正常。有谁知道怎样才能让它正常工作吗?
答案 0 :(得分:2)
我仍然没有找到这个问题的真正解决办法,但想出了一个变通方案。要点是:把所有涉及CUDA操作的函数都放在同一个CPU线程中执行。例如,可以这样做:
/// <summary>
/// Demonstrates the workaround: every call that touches CUDA/CUBLAS is routed through
/// <c>CudaManager.CallMethod</c>, so it executes on one dedicated thread no matter which
/// thread requested it.
/// </summary>
class Program
{
    // Host-side buffers: vector1 is uploaded to the device, vector2 receives the download.
    private static float[] vector1, vector2;
    private static CUDA cuda;
    private static CUBLAS cublas;
    // Device allocation created in AllocateVectors.
    private static CUdeviceptr ptr;
    // Signalled by each worker thread once its CUDA call has completed.
    private static readonly AutoResetEvent autoResetEvent = new AutoResetEvent(false);

    /// <summary>
    /// Allocates the device vector, overwrites it from one thread, reads it back from
    /// another thread, and prints the five downloaded values.
    /// </summary>
    static void Main()
    {
        cuda = new CUDA(true);
        cublas = new CUBLAS(cuda);

        // Requested from the main thread, executed by CudaManager.
        CudaManager.CallMethod(AllocateVectors);

        // First worker rewrites the device vector; second worker downloads it.
        // Each call returns only after the worker has signalled completion.
        StartForegroundThreadAndWait(ChangeVectorOnDevice_ThreadRun);
        StartForegroundThreadAndWait(GetVectorFromDevice_ThreadRun);

        object[] components = { vector2[0], vector2[1], vector2[2], vector2[3], vector2[4] };
        Console.WriteLine("({0}, {1}, {2}, {3}, {4})", components);
        Console.ReadKey(true);
    }

    /// <summary>
    /// Runs <paramref name="entryPoint"/> on a new foreground thread and blocks the
    /// calling thread until that thread sets <c>autoResetEvent</c>.
    /// </summary>
    private static void StartForegroundThreadAndWait(ThreadStart entryPoint)
    {
        Thread worker = new Thread(entryPoint);
        worker.IsBackground = false;
        worker.Start();
        autoResetEvent.WaitOne();
    }

    /// <summary>Creates both host buffers, allocates device memory and uploads vector1.</summary>
    private static void AllocateVectors()
    {
        vector1 = new float[] { 1f, 2f, 3f, 4f, 5f };
        vector2 = new float[5];

        ptr = cublas.Allocate(vector1.Length, sizeof(float));
        cublas.SetVector(vector1, ptr);
    }

    /// <summary>Downloads the device vector into vector2.</summary>
    private static void GetVectorFromDevice()
    {
        cublas.GetVector(ptr, vector2);
    }

    /// <summary>Replaces vector1 with negated sample data and uploads it to the device.</summary>
    private static void ChangeVectorOnDevice()
    {
        vector1 = new float[] { -1f, -2f, -3f, -4f, -5f };
        cublas.SetVector(vector1, ptr);
    }

    /// <summary>Worker body: performs the upload via CudaManager, then wakes the main thread.</summary>
    private static void ChangeVectorOnDevice_ThreadRun()
    {
        CudaManager.CallMethod(ChangeVectorOnDevice);
        autoResetEvent.Set();
    }

    /// <summary>Worker body: performs the download via CudaManager, then wakes the main thread.</summary>
    private static void GetVectorFromDevice_ThreadRun()
    {
        CudaManager.CallMethod(GetVectorFromDevice);
        autoResetEvent.Set();
    }
}
/// <summary>
/// Executes delegates on a single dedicated background thread, so that code requiring
/// thread affinity can be invoked from any thread. Calls are serialized: only one
/// delegate runs at a time, and <see cref="CallMethod"/> blocks until it has finished.
/// </summary>
public static class CudaManager
{
    /// <summary>The delegate most recently handed to the dedicated thread by <see cref="CallMethod"/>.</summary>
    public static Action WorkMethod { get; private set; }

    // Signalled by CallMethod when WorkMethod holds a delegate to execute.
    private static readonly AutoResetEvent actionReceived = new AutoResetEvent(false);
    // Signalled by the dedicated thread when the delegate has finished (or failed).
    private static readonly AutoResetEvent callbackEvent = new AutoResetEvent(false);
    // Serializes callers so that only one request is in flight at a time.
    private static readonly object gate = new object();
    private static bool isCudaThreadRunning;
    // The dedicated thread; used to detect nested CallMethod calls made from it.
    private static Thread cudaThread;
    // Exception thrown by the last delegate, handed back to the waiting caller.
    private static Exception workError;

    /// <summary>Body of the dedicated thread: waits for a delegate, runs it, releases the caller.</summary>
    private static void ThreadRun()
    {
        while (actionReceived.WaitOne())
        {
            try
            {
                WorkMethod.Invoke();
            }
            catch (Exception ex)
            {
                // Keep the dedicated thread alive and report the failure to the caller
                // instead of letting an unhandled exception escape on this thread.
                workError = ex;
            }
            finally
            {
                // Always release the caller, otherwise it would block forever on failure.
                callbackEvent.Set();
            }
        }
    }

    static CudaManager()
    {
        Run();
    }

    /// <summary>
    /// Starts the dedicated thread if it is not running yet. The static constructor
    /// already calls this, so later calls are no-ops.
    /// </summary>
    public static void Run()
    {
        if (!isCudaThreadRunning)
        {
            Thread thread = new Thread(ThreadRun);
            thread.IsBackground = true;
            // Published before Start so the dedicated thread can recognize itself.
            cudaThread = thread;
            thread.Start();
            isCudaThreadRunning = true;
        }
    }

    /// <summary>
    /// Runs <paramref name="method"/> on the dedicated thread and blocks until it completes.
    /// </summary>
    /// <param name="method">The work to execute; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="method"/> is null.</exception>
    /// <exception cref="InvalidOperationException">
    /// <paramref name="method"/> threw on the dedicated thread; the original exception is
    /// available as <see cref="Exception.InnerException"/>.
    /// </exception>
    public static void CallMethod(Action method)
    {
        if (method == null)
        {
            throw new ArgumentNullException("method");
        }

        // A delegate that calls CallMethod again is already on the dedicated thread.
        // Queuing it would deadlock (the outer caller holds the lock), so run it inline.
        if (Thread.CurrentThread == cudaThread)
        {
            method();
            return;
        }

        lock (gate)
        {
            workError = null;
            WorkMethod = method;
            // Wake the dedicated thread.
            actionReceived.Set();
            // Block the caller until the delegate invocation is complete.
            callbackEvent.WaitOne();

            Exception error = workError;
            workError = null;
            if (error != null)
            {
                throw new InvalidOperationException("The work method failed on the CUDA thread.", error);
            }
        }
    }
}
希望这对其他人有所帮助。
答案 1 :(得分:1)
查看GASS文档中的CUDAContextSynchronizer类。