我需要把下面这个来自 pytorch_geometric(一个流行的 PyTorch 扩展库)的 Dockerfile 从 CUDA 9.0 改为 CUDA 10.0。
# Base OS image; the CUDA packages are installed on top of it from NVIDIA's apt
# repositories registered below.
FROM ubuntu:16.04

# 1. Install the HTTPS/GPG helpers apt needs to talk to NVIDIA's repositories.
# 2. Fetch NVIDIA's signing key, export it by fingerprint and compare it with the
#    pinned SHA-256 digest; the build stops if the key does not match.
# 3. Register the CUDA and machine-learning apt repositories.
# The checksum line uses the "<digest><space><space><file>" layout that
# `sha256sum -c` documents (second character is the text/binary mode marker).
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
        apt-utils \
        ca-certificates \
        apt-transport-https \
        gnupg-curl \
 && rm -rf /var/lib/apt/lists/* \
 && NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 \
 && NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 \
 && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub \
 && apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub \
 && echo "$NVIDIA_GPGKEY_SUM  cudasign.pub" | sha256sum -c --strict - \
 && rm cudasign.pub \
 && echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list \
 && echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
# Pinned component versions exported to later build steps and to the container.
ENV CUDA_VERSION=9.0.176 \
    NCCL_VERSION=2.4.2 \
    CUDNN_VERSION=7.4.2.24
# apt "name-suffix=version" fragment appended to CUDA package names below.
# Kept in its own ENV so that ${CUDA_VERSION} is already defined when it expands.
ENV CUDA_PKG_VERSION="9-0=${CUDA_VERSION}-1"
# Install the pinned cuda-cudart package and create /usr/local/cuda as a
# relative symlink to cuda-9.0; apt lists are dropped in the same layer.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
        cuda-cudart-$CUDA_PKG_VERSION \
 && ln -s cuda-9.0 /usr/local/cuda \
 && rm -rf /var/lib/apt/lists/*
# Install the pinned CUDA libraries and NCCL, then put libnccl2 on hold so that
# later apt operations leave the pinned version alone.
# NOTE(review): --allow-unauthenticated skips apt signature verification even
# though the repository key is imported earlier in this file — confirm whether
# the flag is still needed.
RUN apt-get update \
 && apt-get install -y --allow-unauthenticated --no-install-recommends \
        cuda-libraries-$CUDA_PKG_VERSION \
        libnccl2=$NCCL_VERSION-1+cuda9.0 \
 && apt-mark hold libnccl2 \
 && rm -rf /var/lib/apt/lists/*
# Install the CUDA development packages (headers, NVML, minimal build tools,
# command-line tools) plus the NCCL development package.  cuda-core and
# cuda-cublas-dev are pinned to explicit patch revisions.
# NOTE(review): --allow-unauthenticated skips apt signature verification —
# confirm whether the flag is still needed.
RUN apt-get update \
 && apt-get install -y --allow-unauthenticated --no-install-recommends \
        cuda-libraries-dev-$CUDA_PKG_VERSION \
        cuda-nvml-dev-$CUDA_PKG_VERSION \
        cuda-minimal-build-$CUDA_PKG_VERSION \
        cuda-command-line-tools-$CUDA_PKG_VERSION \
        cuda-core-9-0=9.0.176.3-1 \
        cuda-cublas-dev-9-0=9.0.176.4-1 \
        libnccl-dev=$NCCL_VERSION-1+cuda9.0 \
 && rm -rf /var/lib/apt/lists/*
# Point the linker's LIBRARY_PATH at the CUDA stub libraries directory.
ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs

# NVIDIA docker 1.0.
LABEL com.nvidia.volumes.needed="nvidia_driver" \
      com.nvidia.cuda.version="${CUDA_VERSION}"
# Register the NVIDIA library directories with the dynamic linker configuration.
RUN printf '%s\n' /usr/local/nvidia/lib /usr/local/nvidia/lib64 >> /etc/ld.so.conf.d/nvidia.conf
# Put the NVIDIA and CUDA binaries first on PATH and expose the NVIDIA
# library directories to the runtime loader.
ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} \
    LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64

# NVIDIA container runtime.
ENV NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
    NVIDIA_REQUIRE_CUDA="cuda>=9.0"
# PyTorch (Geometric) installation
# Drop the NVIDIA apt source entries so the following `apt-get update` only
# contacts the distribution's own repositories.
RUN rm /etc/apt/sources.list.d/cuda.list /etc/apt/sources.list.d/nvidia-ml.list

# General-purpose tools used by the rest of the build and inside the container.
RUN apt-get update \
 && apt-get install -y \
        curl \
        ca-certificates \
        vim \
        sudo \
        git \
        bzip2 \
        libx11-6 \
 && rm -rf /var/lib/apt/lists/*
# Working directory; WORKDIR creates /app when it does not exist yet.
WORKDIR /app

# Create the unprivileged account "user" (no password, bash shell), hand /app
# over to it and allow it to run any command through sudo without a password.
RUN adduser --disabled-password --gecos '' --shell /bin/bash user \
 && chown -R user:user /app \
 && echo "user ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/90-user

# Everything from here on runs as the non-root user.
USER user

# /home/user is the home directory for every user of the image, so it is made
# world-writable.
ENV HOME=/home/user
RUN chmod 777 /home/user
# Install Miniconda.
# curl flags: -f makes an HTTP error fail the build instead of saving the error
# page as the installer, -L follows redirects, -sS stays quiet but still prints
# errors.  The installer runs in batch mode (-b) into ~/miniconda (-p) and is
# deleted in the same layer.
RUN curl -fsSL -o ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-4.5.12-Linux-x86_64.sh \
 && chmod +x ~/miniconda.sh \
 && ~/miniconda.sh -b -p ~/miniconda \
 && rm ~/miniconda.sh
# Make the conda executables available and stop conda from updating itself.
ENV PATH=/home/user/miniconda/bin:$PATH
ENV CONDA_AUTO_UPDATE_CONDA=false
# Create a Python 3.6 environment.
# Every conda call passes -y so the non-interactive build never depends on how
# a confirmation prompt behaves when no terminal is attached.
RUN /home/user/miniconda/bin/conda install -y conda-build \
 && /home/user/miniconda/bin/conda create -y --name py36 python=3.6.5 \
 && /home/user/miniconda/bin/conda clean -ya
# Select the py36 environment for later steps by exporting the variables conda
# uses and putting the environment's bin directory first on PATH.
# CONDA_PREFIX is a separate ENV so that $CONDA_DEFAULT_ENV is already defined.
ENV CONDA_DEFAULT_ENV=py36
ENV CONDA_PREFIX=/home/user/miniconda/envs/$CONDA_DEFAULT_ENV
ENV PATH=$CONDA_PREFIX/bin:$PATH
# CUDA 9.0-specific steps.
# Install the CUDA 9.0 builds of PyTorch and its companion packages from the
# "pytorch" channel; the quoted spec pins PyTorch down to its build string.
RUN conda install --yes --channel pytorch \
        cuda90=1.0 \
        magma-cuda90=2.4.0 \
        "pytorch=1.1.0=py3.6_cuda9.0.176_cudnn7.5.1_0" \
        torchvision=0.2.1 \
 && conda clean --yes --all
# Install HDF5 Python bindings.
RUN conda install -y h5py=2.8.0 \
 && conda clean -ya
# pip installs use --no-cache-dir so the download cache is not stored in the
# image layer.
RUN pip install --no-cache-dir h5py-cache==1.0

# Install TorchNet, a high-level framework for PyTorch.
RUN pip install --no-cache-dir torchnet==0.0.4

# Install Requests, a Python library for making HTTP requests.
RUN conda install -y requests=2.19.1 \
 && conda clean -ya

# Install Graphviz: the conda package of that name first, then the pip package
# of the same name.
RUN conda install -y graphviz=2.38.0 \
 && conda clean -ya
RUN pip install --no-cache-dir graphviz==0.8.4
# Install OpenCV3 Python bindings.
# The build runs as the non-root user at this point, so the apt steps go
# through sudo.
RUN sudo apt-get update \
 && sudo apt-get install -y --no-install-recommends \
        libgtk2.0-0 \
        libcanberra-gtk-module \
 && sudo rm -rf /var/lib/apt/lists/*
# OpenCV itself comes from the "menpo" conda channel.
RUN conda install --yes --channel menpo opencv3=3.1.0 \
 && conda clean --yes --all
# Install PyTorch Geometric.
# Expose the CUDA header and library directories to the pip builds below.
# They are set with ENV because a variable assigned inside a RUN exists only in
# that instruction's shell and is gone when the next RUN starts.
# DYLD_LIBRARY_PATH is read only by the macOS dynamic loader, so it is not set
# in this Linux image.
ENV CPATH=/usr/local/cuda/include \
    LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH}
# Install the extension packages one by one (verbose output, no pip cache),
# then torch-geometric itself.
RUN pip install --verbose --no-cache-dir torch-scatter \
 && pip install --verbose --no-cache-dir torch-sparse \
 && pip install --verbose --no-cache-dir torch-cluster \
 && pip install --verbose --no-cache-dir torch-spline-conv \
 && pip install torch-geometric
# Set the default command to python3.
# Exec (JSON-array) form: python3 is started directly, without a wrapping shell,
# and can be replaced by passing another command to `docker run`.
CMD ["python3"]
我尝试改为以 `FROM pytorch/pytorch:1.1.0-cuda10.0-cudnn7.5-runtime`
开头,把 `# PyTorch (Geometric) installation`
之前的所有内容都注释掉,并把 `# CUDA 9.0-specific steps.`
下面的部分替换为:
# Asker's replacement for the CUDA 9.0-specific step.
# Installs pytorch from the "pytorch" channel with no version pinned.
RUN conda install -c pytorch pytorch
# Installs the package named cuda10.0 from the "fragcolor" channel, then clears
# conda's caches.
RUN conda install -c fragcolor cuda10.0 && conda clean -ya
同时注释掉以下部分:
# Install Graphviz.
# Step the asker commented out: the conda graphviz package followed by the pip
# package of the same name.
RUN conda install -y graphviz=2.38.0 \
&& conda clean -ya
RUN pip install graphviz==0.8.4
(这一部分即使在 CUDA 9.0 下似乎也无法正常工作。)
这样修改之后,镜像可以正常构建和启动,PyTorch 可以导入,CUDA 也能正常使用。但是,当我尝试执行 `import torch_geometric`
时,会得到 `ModuleNotFoundError: No module named 'torch_scatter.scatter_cuda'`。
由于该软件包维护良好(拥有 4.5k 星标,并且在 PyTorch 官网上被提及),我认为问题很可能出在我自己身上,属于如何从 CUDA 9.0 迁移到 CUDA 10.0 的一般性问题。
如果有人能指出我可能做错了什么,或者建议如何在不从原始 Dockerfile 中删除这么多行的情况下完成修改(删除过多内容可能正是问题的原因),我将不胜感激。
答案 0 :(得分:0)
您是否尝试过使用 `nvidia/cuda`
Docker 基础镜像?
可以试试:
# NVIDIA's CUDA base image; the tag selects CUDA 10.0, cuDNN 7, the "devel"
# variant and Ubuntu 16.04.
FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04

# System Python and pip.  Removing the apt lists is the last step of the layer
# so they are not stored in the image; running `apt-get update` after the
# cleanup would only download them again.
# NOTE(review): python-dev / python-pip / python-setuptools look like the
# Python 2 packages, so `pip` below presumably does not target python3.6 —
# confirm this is intended.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
        apt-utils \
        python3.6 \
        python-dev \
        python-pip \
        python-setuptools \
 && rm -rf /var/lib/apt/lists/*

# Pin pip itself, then install the pinned torch / torchvision pair without
# keeping pip's download cache.
RUN pip install --upgrade pip==9.0.3 \
 && pip --no-cache-dir install --upgrade torch==1.1.0 \
 && pip --no-cache-dir install --upgrade torchvision==0.3.0
我在上面写的这些软件包版本可以在该 Dockerfile 中稳定使用。
我已经验证过,运行良好,没有出现任何崩溃。