标签: c++ cuda pytorch interceptor


在Linux上,如果所运行的应用程序是动态链接到CUDA运行时库(libcudart.so)的,就可以利用"LD_PRELOAD技巧"来钩住(hook)CUDA运行时API调用。下面是一个完整示例:


$ cat mylib.cpp
#include <stdio.h>
#include <unistd.h>
#include <dlfcn.h>
#include <cuda_runtime.h>

// LD_PRELOAD interposer for cudaMemcpy: prints a marker line, then forwards the
// call unchanged to the next definition of the symbol in library search order.
// Returns whatever the forwarded call returns, or
// cudaErrorSharedObjectSymbolNotFound if no next definition could be resolved.
cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind )
{
    typedef cudaError_t (*cudaMemcpy_fn)( void*, const void*, size_t, cudaMemcpyKind );
    // Resolve the next "cudaMemcpy" once and reuse the pointer on later calls.
    static cudaMemcpy_fn lcudaMemcpy = (cudaMemcpy_fn)dlsym(RTLD_NEXT, "cudaMemcpy");
    printf("cudaMemcpy hooked\n");
    if (lcudaMemcpy == NULL) {
        // Calling through a null pointer would crash; report the failure instead.
        fprintf(stderr, "cudaMemcpy hook: dlsym(RTLD_NEXT, \"cudaMemcpy\") failed\n");
        return cudaErrorSharedObjectSymbolNotFound;
    }
    return lcudaMemcpy( dst, src, count, kind );
}

// LD_PRELOAD interposer for cudaMemcpyAsync: prints a marker line, then forwards
// the call unchanged to the next definition of the symbol in library search order.
// Returns whatever the forwarded call returns, or
// cudaErrorSharedObjectSymbolNotFound if no next definition could be resolved.
cudaError_t cudaMemcpyAsync ( void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t str )
{
    typedef cudaError_t (*cudaMemcpyAsync_fn)( void*, const void*, size_t, cudaMemcpyKind, cudaStream_t );
    // Resolve the next "cudaMemcpyAsync" once and reuse the pointer on later calls.
    static cudaMemcpyAsync_fn lcudaMemcpyAsync = (cudaMemcpyAsync_fn)dlsym(RTLD_NEXT, "cudaMemcpyAsync");
    printf("cudaMemcpyAsync hooked\n");
    if (lcudaMemcpyAsync == NULL) {
        // Calling through a null pointer would crash; report the failure instead.
        fprintf(stderr, "cudaMemcpyAsync hook: dlsym(RTLD_NEXT, \"cudaMemcpyAsync\") failed\n");
        return cudaErrorSharedObjectSymbolNotFound;
    }
    return lcudaMemcpyAsync( dst, src, count, kind, str );
}
$ g++ -I/usr/local/cuda/include -fPIC -shared -o libmylib.so mylib.cpp -ldl -L/usr/local/cuda/lib64 -lcudart
$ cat t1.cu
#include <stdio.h>

// Test driver: issues one cudaMemcpy and two cudaMemcpyAsync calls so that each
// preloaded hook is exercised (the transcript expects one + two marker lines).
int main(){

  int a = 0, *d_a;  // initialize the host value so the copies read defined data
  cudaMalloc(&d_a, sizeof(d_a[0]));
  cudaMemcpy(d_a, &a, sizeof(a), cudaMemcpyHostToDevice);
  cudaStream_t str;
  cudaStreamCreate(&str);  // the stream handle must be created before it is used
  // Four-argument form: relies on the default value of the stream parameter.
  cudaMemcpyAsync(d_a, &a, sizeof(a), cudaMemcpyHostToDevice);
  cudaMemcpyAsync(d_a, &a, sizeof(a), cudaMemcpyHostToDevice, str);
  cudaDeviceSynchronize();  // let the asynchronous copies finish before teardown
  cudaStreamDestroy(str);
  cudaFree(d_a);
  return 0;
}
$ nvcc -o t1 t1.cu -cudart shared
$ LD_LIBRARY_PATH=/usr/local/cuda/lib64 LD_PRELOAD=./libmylib.so cuda-memcheck ./t1
cudaMemcpy hooked
cudaMemcpyAsync hooked
cudaMemcpyAsync hooked
========= ERROR SUMMARY: 0 errors

(以上在CentOS 7、CUDA 10.2环境下测试。)

同样的方法也可以用于PyTorch,下面是在NGC的PyTorch容器中运行的例子:


$ docker run --gpus all -it nvcr.io/nvidia/pytorch:20.08-py3
Status: Downloaded newer image for nvcr.io/nvidia/pytorch:20.08-py3

root@946934df529b:/workspace# cat mylib.cpp
#include <stdio.h>
#include <unistd.h>
#include <dlfcn.h>
#include <cuda_runtime.h>

// LD_PRELOAD interposer for cudaMemcpy: prints a marker line, then forwards the
// call unchanged to the next definition of the symbol in library search order.
// Returns whatever the forwarded call returns, or
// cudaErrorSharedObjectSymbolNotFound if no next definition could be resolved.
cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind )
{
    typedef cudaError_t (*cudaMemcpy_fn)( void*, const void*, size_t, cudaMemcpyKind );
    // Resolve the next "cudaMemcpy" once and reuse the pointer on later calls.
    static cudaMemcpy_fn lcudaMemcpy = (cudaMemcpy_fn)dlsym(RTLD_NEXT, "cudaMemcpy");
    printf("cudaMemcpy hooked\n");
    if (lcudaMemcpy == NULL) {
        // Calling through a null pointer would crash; report the failure instead.
        fprintf(stderr, "cudaMemcpy hook: dlsym(RTLD_NEXT, \"cudaMemcpy\") failed\n");
        return cudaErrorSharedObjectSymbolNotFound;
    }
    return lcudaMemcpy( dst, src, count, kind );
}

// LD_PRELOAD interposer for cudaMemcpyAsync: prints a marker line, then forwards
// the call unchanged to the next definition of the symbol in library search order.
// Returns whatever the forwarded call returns, or
// cudaErrorSharedObjectSymbolNotFound if no next definition could be resolved.
cudaError_t cudaMemcpyAsync ( void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t str )
{
    typedef cudaError_t (*cudaMemcpyAsync_fn)( void*, const void*, size_t, cudaMemcpyKind, cudaStream_t );
    // Resolve the next "cudaMemcpyAsync" once and reuse the pointer on later calls.
    static cudaMemcpyAsync_fn lcudaMemcpyAsync = (cudaMemcpyAsync_fn)dlsym(RTLD_NEXT, "cudaMemcpyAsync");
    printf("cudaMemcpyAsync hooked\n");
    if (lcudaMemcpyAsync == NULL) {
        // Calling through a null pointer would crash; report the failure instead.
        fprintf(stderr, "cudaMemcpyAsync hook: dlsym(RTLD_NEXT, \"cudaMemcpyAsync\") failed\n");
        return cudaErrorSharedObjectSymbolNotFound;
    }
    return lcudaMemcpyAsync( dst, src, count, kind, str );
}
root@946934df529b:/workspace# g++ -I/usr/local/cuda/include -fPIC -shared -o libmylib.so mylib.cpp -ldl -L/usr/local/cuda/lib64 -lcudart
root@946934df529b:/workspace# cat tt.py
# Small PyTorch workload used to exercise the preloaded CUDA hooks.
import torch

# Index of the currently selected CUDA device.
device = torch.cuda.current_device()

# Build each operand on the host, then move it to the GPU with .to();
# the transcript below shows two hooked cudaMemcpyAsync calls for this script.
x = torch.randn(1024, 1024)
x = x.to(device)
y = torch.randn(1024, 1024)
y = y.to(device)

# Matrix product of the two device tensors.
z = x @ y
root@946934df529b:/workspace# LD_LIBRARY_PATH=/usr/local/cuda/lib64 LD_PRELOAD=./libmylib.so python tt.py
cudaMemcpyAsync hooked
cudaMemcpyAsync hooked

