我正在尝试重新实现torch.nn.RNN
模块,而不使用C++/CUDA绑定,即使用简单的张量操作和相关逻辑。我已经开发了以下RNN类和相关测试逻辑,可用于与参考模块实例的输出进行比较:
import torch
import torch.nn as nn
class RNN(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, bidirectional=False):
super(RNN, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.num_layers = num_layers
self.bidirectional = bidirectional
self.w_ih = [torch.randn(hidden_size, input_size)]
if bidirectional:
self.w_ih_reverse = [torch.randn(hidden_size, input_size)]
for layer in range(num_layers - 1):
self.w_ih.append(torch.randn(hidden_size, hidden_size))
if bidirectional:
self.w_ih_reverse.append(torch.randn(hidden_size, hidden_size))
self.w_hh = torch.randn(num_layers, hidden_size, hidden_size)
if bidirectional:
self.w_hh_reverse = torch.randn(num_layers, hidden_size, hidden_size)
def forward(self, input, h_0=None):
if h_0 is None:
if self.bidirectional:
h_0 = torch.zeros(2, self.num_layers, input.shape[1], self.hidden_size)
else:
h_0 = torch.zeros(1, self.num_layers, input.shape[1], self.hidden_size)
if self.bidirectional:
output = torch.zeros(input.shape[0], input.shape[1], 2 * self.hidden_size)
else:
output = torch.zeros(input.shape[0], input.shape[1], self.hidden_size)
for t in range(input.shape[0]):
print(input.shape, t)
input_t = input[t]
if self.bidirectional:
input_t_reversed = input[-1 - t]
for layer in range(self.num_layers):
h_t = torch.tanh(torch.matmul(input_t, self.w_ih[layer].T) + torch.matmul(h_0[0][layer], self.w_hh[layer].T))
h_0[0][layer] = h_t
if self.bidirectional:
h_t_reverse = torch.tanh(torch.matmul(input_t_reversed, self.w_ih_reverse[layer].T) + torch.matmul(h_0[1][layer], self.w_hh_reverse[layer].T))
h_0[1][layer] = h_t_reverse
input_t = h_t
if self.bidirectional:
# This logic is incorrect for bidirectional RNNs with multiple layers
input_t = torch.cat((h_t, h_t_reverse), dim=-1)
input_t_reversed = input_t
output[t, :, :self.hidden_size] = h_t
if self.bidirectional:
output[-1 - t, :, self.hidden_size:] = h_t_reverse
return output
if __name__ == '__main__':
input_size = 10
hidden_size = 12
num_layers = 2
batch_size = 2
bidirectional = True
input = torch.randn(2, batch_size, input_size)
rnn_val = torch.nn.RNN(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, bias=False, bidirectional=bidirectional, nonlinearity='tanh')
rnn = RNN(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional)
for i in range(rnn_val.num_layers):
rnn.w_ih[i] = rnn_val._parameters['weight_ih_l%d' % i].data
rnn.w_hh[i] = rnn_val._parameters['weight_hh_l%d' % i].data
if bidirectional:
rnn.w_ih_reverse[i] = rnn_val._parameters['weight_ih_l%d_reverse' % i].data
rnn.w_hh_reverse[i] = rnn_val._parameters['weight_hh_l%d_reverse' % i].data
output_val, hn_val = rnn_val(input)
output = rnn(input)
print(output_val)
print(output)
我的实现似乎可以适用于任意层数和不同批次大小/序列长度的普通RNN,以及单层双向RNN,但是对于多层双向RNN,它无法产生正确的结果。
出于简单起见,偏置项尚未实现,并且仅支持tanh激活函数。我已经将逻辑错误缩小到这一行input_t = torch.cat((h_t, h_t_reverse), dim=-1)
,因为第一个输出序列是不正确的。
如果有人能够指点我正确的方向,并让我知道问题所在,我将不胜感激!