宿主机 安装 driver
# 禁用宿主机 自带 显卡 驱动
lsmod | grep nouveau
nouveau 1949696 0
mxm_wmi 16384 1 nouveau
wmi 32768 2 mxm_wmi,nouveau
video 49152 1 nouveau
i2c_algo_bit 16384 1 nouveau
ttm 106496 2 qxl,nouveau
drm_kms_helper 184320 4 qxl,nouveau
drm 491520 5 drm_kms_helper,qxl,ttm,nouveau
# 首先禁用 nouveau
vi /etc/modprobe.d/blacklist.conf
# 在最后一行添加:blacklist nouveau
# Blacklist the nouveau driver in modprobe's configuration.
# The entry is appended only when an exact "blacklist nouveau" line is not
# already present, so re-running this step does not pile up blank lines and
# never rewrites other lines that merely contain that text.
modprobe_path='/etc/modprobe.d/blacklist.conf'
if ! grep -qxF 'blacklist nouveau' "${modprobe_path}" 2>/dev/null; then
  # tee -a runs under sudo so the append also works from a non-root shell.
  printf '\nblacklist nouveau\n' | sudo tee -a "${modprobe_path}" >/dev/null
fi
# Regenerate the initramfs so the updated blacklist is included in it.
sudo update-initramfs -u
# 关闭图形界面
systemctl set-default multi-user.target
reboot
lsmod | grep nouveau
# 无输出 代表成功
# Use 114.114.114.114 as the DNS resolver. This overwrites /etc/resolv.conf;
# tee runs under sudo so the write also works from a non-root shell.
echo "nameserver 114.114.114.114" | sudo tee /etc/resolv.conf >/dev/null
# Point the Ubuntu archive and security repositories at the Huawei Cloud
# mirror. Dots are escaped and the match stops at whitespace, so only the
# repository URL itself is rewritten.
sudo sed -i -E \
  "s@https?://[^[:space:]]*(archive|security)\.ubuntu\.com@http://repo.huaweicloud.com@g" \
  /etc/apt/sources.list
# Refresh the package index and install the NVIDIA 460 server driver.
sudo apt-get update -y
sudo apt-get install -y nvidia-driver-460-server
# 关闭图形界面
systemctl set-default multi-user.target
# Register the nvidia-docker apt repository and install nvidia-docker2.
# $distribution is ID followed by VERSION_ID from /etc/os-release and selects
# the matching nvidia-docker.list; each step runs only if the previous one
# succeeded (&& chain).
distribution=$(. /etc/os-release;echo $ID$VERSION_ID) \
&& curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - \
&& curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
# Refresh the package index so the newly added repository is visible.
sudo apt-get update -y
sudo apt-get install -y nvidia-docker2
# Restart Docker after the install and enable it at boot.
sudo systemctl restart docker
sudo systemctl enable docker
测试 nvidia-smi 驱动
# Configure the Docker daemon with a registry mirror.
# NOTE: this replaces /etc/docker/daemon.json wholesale. The nvidia-docker2
# package installs a daemon.json that registers the "nvidia" runtime, so that
# entry is repeated here to keep `--runtime=nvidia` working after the rewrite.
sudo mkdir -p /etc/docker
sudo tee /etc/docker/daemon.json <<'EOF'
{
  "registry-mirrors": ["https://wm12hkla.mirror.aliyuncs.com"],
  "runtimes": {
    "nvidia": {
      "path": "nvidia-container-runtime",
      "runtimeArgs": []
    }
  }
}
EOF
# Reload unit files and restart Docker so the new daemon.json is applied.
sudo systemctl daemon-reload
sudo systemctl restart docker
# Overwrite the resolver configuration again; tee runs under sudo so the write
# also works from a non-root shell.
echo "nameserver 114.114.114.114" | sudo tee /etc/resolv.conf >/dev/null
# 国内 加速
sudo docker run --rm --gpus all registry.cn-hangzhou.aliyuncs.com/mkmk/all:nvidia-cuda-11-base nvidia-smi
Thu Apr 8 16:52:50 2021
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03 Driver Version: 460.32.03 CUDA Version: 11.2 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 GeForce GT 730 Off | 00000000:00:03.0 N/A | N/A |
| 30% 30C P0 N/A / N/A | 0MiB / 2002MiB | N/A Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 1 GeForce GT 730 Off | 00000000:00:04.0 N/A | N/A |
| 30% 27C P0 N/A / N/A | 0MiB / 2002MiB | N/A Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
root@free_cicd:~#
# 关闭图形界面
systemctl set-default multi-user.target
使用 gpu-burn 测试 gpu
# Registry that hosts the TensorFlow image. It must be set before the commands
# below: with an empty value the image name would expand to
# "/tensorflow/tensorflow:2.4.1-gpu-jupyter". An already-exported value wins.
register_url="${register_url:-192.168.170.100:5000}"
image="${register_url}/tensorflow/tensorflow:2.4.1-gpu-jupyter"
# Run in the foreground
docker run -it --gpus=all --name gpu-jupyter1 -p 8888:8888 "${image}"
# Remove the container
docker stop gpu-jupyter1 && docker rm gpu-jupyter1
# Run in the background; the password then has to be looked up in the logs
docker run -d --gpus=all --name gpu-jupyter1 -p 8888:8888 "${image}"
docker logs gpu-jupyter1
ip:8888 访问即可
gpu-burn 的 dockerfile
# NOTE(review): this line expands ${register_url}, which within this snippet is
# assigned only two lines below, and it reuses the container name gpu-jupyter1;
# it presumably fails if a container with that name still exists. Confirm the
# intended order before running.
docker run -it --gpus=all --name gpu-jupyter1 -p 8888:8888 ${register_url}/tensorflow/tensorflow:2.4.1-gpu-jupyter
# Push to the remote registry
register_url='192.168.170.100:5000'
# Re-tag the image from the local registry under the Aliyun repository, then push it.
docker tag ${register_url}/tensorflow/tensorflow:2.4.1-gpu-jupyter registry.cn-hangzhou.aliyuncs.com/mkmk/all:tensorflow-2.4.1-gpu-jupyter
docker push registry.cn-hangzhou.aliyuncs.com/mkmk/all:tensorflow-2.4.1-gpu-jupyter
赠送的 一些 tf 简单代码
# Check whether TensorFlow is using a GPU
import tensorflow as tf
# The result of this first call is neither assigned nor printed; the print()
# below issues the call again with default arguments and shows the result.
tf.test.is_gpu_available(
cuda_only=False,
min_cuda_compute_capability=None
)
print("is_gpu: ", tf.test.is_gpu_available())
# List all available compute devices
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
# 加法
# cpu
%%time
with tf.device("/device:CPU:0"):
a=tf.zeros([1000,1000])
print("a on gpu:",a.device.endswith('GPU:0'))
for i in range(10000):
b=tf.add(a,a)
-->
a on gpu: False
CPU times: user 7.74 s, sys: 1.2 s, total: 8.94 s
Wall time: 3.39 s
# gpu
%%time
with tf.device("/device:GPU:0"):
a=tf.zeros([1000,1000])
print("a on gpu:",a.device.endswith('GPU:0'))
for i in range(10000):
b=tf.add(a,a)
-->
a on gpu: True
CPU times: user 900 ms, sys: 1.22 s, total: 2.12 s
Wall time: 2.12 s
讲解: 真正的计算时间是用户态 (user) 计算时间;7 s 不是物理 (墙钟) 时间,而是 CPU 的逻辑耗时
CPU times: user 7.74 s, (cpu 耗时)
CPU times: user 900 ms, (GPU 耗时)
来聊聊天啊
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用
。你还可以使用@
来通知其他用户。