In PyTorch, I wrote a very simple CNN discriminator and trained it. Now I need to deploy it to make predictions, but the target machine has little GPU memory and I run into an out-of-memory error. I therefore thought I could set requires_grad = False on all parameters to stop PyTorch from storing gradient values, yet I haven't found that it makes any difference.

My model has about 5 million parameters, but predicting a single input batch consumes about 1.2 GB of GPU memory. Since 5 million float32 parameters amount to only about 20 MB, I don't see why so much memory should be needed.

The question: how can I reduce GPU memory usage when I only want to use my model for prediction?
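For reference, my current understanding is that requires_grad only controls whether gradients are accumulated for the parameters, while the graph of intermediate activations built during the forward pass is governed by autograd's grad mode, so I suspect torch.no_grad() is the relevant switch. A minimal sketch of what I would try (reusing the discriminator and x from the demo below):

with torch.no_grad():
    # no graph is recorded, so each activation can be freed as soon as
    # the next layer has consumed it
    label, score = discriminator.predict(x)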
Here is a demo, in which I use discriminator.requires_grad_ to disable/enable autograd for all parameters, but it does not seem to help:

import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as functional
from pynvml.smi import nvidia_smi
nvsmi = nvidia_smi.getInstance()
def getMemoryUsage():
    usage = nvsmi.DeviceQuery("memory.used")["gpu"][0]["fb_memory_usage"]
    return "%d %s" % (usage["used"], usage["unit"])
print("Before GPU Memory: %s" % getMemoryUsage())
class Discriminator(nn.Module):
    def __init__(self):
        super().__init__()
        # trainable layers
        # input: 2x256x256
        self.conv1 = nn.Conv2d(2, 8, 5, padding=2)    # 8x256x256
        self.pool1 = nn.MaxPool2d(2)                  # 8x128x128
        self.conv2 = nn.Conv2d(8, 32, 5, padding=2)   # 32x128x128
        self.pool2 = nn.MaxPool2d(2)                  # 32x64x64
        self.conv3 = nn.Conv2d(32, 96, 5, padding=2)  # 96x64x64
        self.pool3 = nn.MaxPool2d(4)                  # 96x16x16
        self.conv4 = nn.Conv2d(96, 256, 5, padding=2) # 256x16x16
        self.pool4 = nn.MaxPool2d(4)                  # 256x4x4
        self.num_flat_features = 4096
        self.fc1 = nn.Linear(4096, 1024)
        self.fc2 = nn.Linear(1024, 256)
        self.fc3 = nn.Linear(256, 1)
        # loss function
        self.loss = nn.MSELoss()
        # other properties
        self.requires_grad = True

    def forward(self, x):
        y = x
        y = self.conv1(y)
        y = self.pool1(y)
        y = functional.relu(y)
        y = self.conv2(y)
        y = self.pool2(y)
        y = functional.relu(y)
        y = self.conv3(y)
        y = self.pool3(y)
        y = functional.relu(y)
        y = self.conv4(y)
        y = self.pool4(y)
        y = functional.relu(y)
        y = y.view((-1, self.num_flat_features))
        y = self.fc1(y)
        y = functional.relu(y)
        y = self.fc2(y)
        y = functional.relu(y)
        y = self.fc3(y)
        y = torch.sigmoid(y)
        return y

    def predict(self, x, score_th=0.5):
        # accept a single unbatched sample by adding a batch dimension
        if len(x.shape) == 3:
            singlebatch = True
            x = x.view([1] + list(x.shape))
        else:
            singlebatch = False
        y = self.forward(x)
        label = (y > float(score_th))
        if singlebatch:
            y = y.view(list(y.shape)[1:])
        return label, y

    def requires_grad_(self, requires_grad=True):
        # toggle autograd for every parameter of the model
        for parameter in self.parameters():
            parameter.requires_grad_(requires_grad)
        self.requires_grad = requires_grad
x = torch.cuda.FloatTensor(np.zeros([2, 256, 256]))
discriminator = Discriminator()
discriminator.to("cuda:0")
# comment/uncomment this line to see the difference
discriminator.requires_grad_(False)
discriminator.predict(x)
print("Requires grad", discriminator.requires_grad)
print("After GPU Memory: %s" % getMemoryUsage())
With the line

discriminator.requires_grad_(False)

commented out, I get this output:

Before GPU Memory: 6350MiB
Requires grad True
After GPU Memory: 7547MiB
When I uncomment the line, I get:

Before GPU Memory: 6350MiB
Requires grad False
After GPU Memory: 7543MiB

That is only about 4 MiB less, so toggling requires_grad makes essentially no difference to the memory used during prediction.
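One more thing I am unsure about: nvidia-smi reports the memory of the whole process, which includes the CUDA context and PyTorch's caching allocator, not just live tensors. A sketch of how I would cross-check with PyTorch's own counters (assuming a recent PyTorch where these functions are available):

torch.cuda.reset_peak_memory_stats()
with torch.no_grad():
    discriminator.predict(x)
# memory_allocated counts only tensors currently alive on the device;
# max_memory_allocated is the peak since the last reset
print("Allocated: %d MiB" % (torch.cuda.memory_allocated() // 2**20))
print("Peak: %d MiB" % (torch.cuda.max_memory_allocated() // 2**20))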