从https://github.com/spro/practical-pytorch/blob/master/seq2seq-translation/seq2seq-translation.ipynb的教程中可以了解到,有一个USE_CUDA
标志,用于控制变量和张量类型在CPU(当为False时)和GPU(当为True时)之间的切换。
使用en-fr.tsv中的数据,并将句子转换为变量:
import unicodedata
import string
import re
import random
import time
import math
from gensim.corpora.dictionary import Dictionary
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import LongTensor, FloatTensor
from torch import optim
import torch.nn.functional as F
import numpy as np
MAX_LENGTH = 10
USE_CUDA = False
# Turn a Unicode string to plain ASCII, thanks to https://dev59.com/8HRB5IYBdhLWcg3wxZ7Y#518232
def unicode_to_ascii(s):
return ''.join(
c for c in unicodedata.normalize('NFD', s)
if unicodedata.category(c) != 'Mn'
)
# Lowercase, trim, and remove non-letter characters
def normalize_string(s):
s = unicode_to_ascii(s.lower().strip())
s = re.sub(r"([.!?])", r" \1", s)
s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
return s
SOS_IDX, SOS_TOKEN = 0, '<s>'
EOS_IDX, EOS_TOKEN = 1, '</s>'
UNK_IDX, UNK_TOKEN = 2, '<unk>'
PAD_IDX, PAD_TOKEN = 3, '<blank>'
lines = open('en-fr.tsv').read().strip().split('\n')
pairs = [[normalize_string(s).split() for s in l.split('\t')] for l in lines]
src_sents, trg_sents = zip(*pairs)
src_dict = Dictionary([[SOS_TOKEN, EOS_TOKEN, UNK_TOKEN, PAD_TOKEN]])
src_dict.add_documents(src_sents)
trg_dict = Dictionary([[SOS_TOKEN, EOS_TOKEN, UNK_TOKEN, PAD_TOKEN]])
trg_dict.add_documents(trg_sents)
def variablize_sentences(sentence, dictionary):
indices = [dictionary.token2id[tok] for tok in sentence] + [dictionary.token2id[EOS_TOKEN]]
var = Variable(LongTensor(indices).view(-1, 1))
return var.cuda() if USE_CUDA else var
input_variables = [variablize_sentences(sent, src_dict) for sent in src_sents]
output_variables = [variablize_sentences(sent, trg_dict) for sent in trg_sents]
通过使用编码器-注意力-解码器网络:
class EncoderRNN(nn.Module):
def __init__(self, input_size, hidden_size, n_layers=1):
super(EncoderRNN, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.n_layers = n_layers
self.embedding = nn.Embedding(input_size, hidden_size)
self.gru = nn.GRU(hidden_size, hidden_size, n_layers)
self.embedding = self.embedding.cuda() if USE_CUDA else self.embedding
self.gru = self.gru.cuda() if USE_CUDA else self.gru
def forward(self, word_inputs, hidden):
seq_len = len(word_inputs)
embedded = self.embedding(word_inputs).view(seq_len, 1, -1)
embedded = embedded.cuda() if USE_CUDA else embedded
output, hidden = self.gru(embedded, hidden)
output = output.cuda() if USE_CUDA else output
hiddne = hidden.cuda() if USE_CUDA else hidden
return output, hidden
def init_hidden(self):
hidden = Variable(torch.zeros(self.n_layers, 1, self.hidden_size))
return hidden.cuda() if USE_CUDA else hidden
class Attn(nn.Module):
def __init__(self, method, hidden_size, max_length=MAX_LENGTH):
super(Attn, self).__init__()
self.method = method
self.hidden_size = hidden_size
if self.method == 'general':
self.attn = nn.Linear(self.hidden_size, hidden_size)
elif self.method == 'concat':
self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
self.other = nn.Parameter(FloatTensor(1, hidden_size))
def forward(self, hidden, encoder_outputs):
seq_len = len(encoder_outputs)
# Create variable to store attention energies
attn_energies = Variable(torch.zeros(seq_len)) # B x 1 x S
attn_energies = attn_energies.cuda() if USE_CUDA else attn_energies
# Calculate energies for each encoder output
for i in range(seq_len):
attn_energies[i] = self.score(hidden, encoder_outputs[i])
# Normalize energies to weights in range 0 to 1, resize to 1 x 1 x seq_len
return F.softmax(attn_energies).unsqueeze(0).unsqueeze(0)
def score(self, hidden, encoder_output):
if self.method == 'dot':
energy =torch.dot(hidden.view(-1), encoder_output.view(-1))
elif self.method == 'general':
energy = self.attn(encoder_output)
energy = torch.dot(hidden.view(-1), energy.view(-1))
elif self.method == 'concat':
energy = self.attn(torch.cat((hidden, encoder_output), 1))
energy = torch.dot(self.v.view(-1), energy.view(-1))
return energy
class AttnDecoderRNN(nn.Module):
def __init__(self, attn_model, hidden_size, output_size, n_layers=1, dropout_p=0.1):
super(AttnDecoderRNN, self).__init__()
# Keep parameters for reference
self.attn_model = attn_model
self.hidden_size = hidden_size
self.output_size = output_size
self.n_layers = n_layers
self.dropout_p = dropout_p
# Define layers
self.embedding = nn.Embedding(output_size, hidden_size)
self.gru = nn.GRU(hidden_size * 2, hidden_size, n_layers, dropout=dropout_p)
self.out = nn.Linear(hidden_size * 2, output_size)
self.embedding = self.embedding.cuda() if USE_CUDA else self.embedding
self.gru = self.gru.cuda() if USE_CUDA else self.gru
self.out = self.out.cuda() if USE_CUDA else self.out
# Choose attention model
if attn_model != 'none':
self.attn = Attn(attn_model, hidden_size)
self.attn = self.attn.cuda() if USE_CUDA else self.attn
def forward(self, word_input, last_context, last_hidden, encoder_outputs):
# Note: we run this one step at a time
# Get the embedding of the current input word (last output word)
word_embedded = self.embedding(word_input).view(1, 1, -1) # S=1 x B x N
# Combine embedded input word and last context, run through RNN
rnn_input = torch.cat((word_embedded, last_context.unsqueeze(0)), 2)
rnn_output, hidden = self.gru(rnn_input, last_hidden)
# Calculate attention from current RNN state and all encoder outputs; apply to encoder outputs
attn_weights = self.attn(rnn_output.squeeze(0), encoder_outputs)
context = attn_weights.bmm(encoder_outputs.transpose(0, 1)) # B x 1 x N
# Final output layer (next word prediction) using the RNN hidden state and context vector
rnn_output = rnn_output.squeeze(0) # S=1 x B x N -> B x N
context = context.squeeze(1) # B x S=1 x N -> B x N
output = F.log_softmax(self.out(torch.cat((rnn_output, context), 1)))
if USE_CUDA:
return output.cuda(), context.cuda(), hidden.cuda(), attn_weights.cuda()
else:
return output, context, hidden, attn_weights
测试网络:
encoder_test = EncoderRNN(10, 10, 2) # I, H , L
decoder_test = AttnDecoderRNN('general', 10, 10, 2) # A, H, O, L
encoder_hidden = encoder_test.init_hidden()
if USE_CUDA:
word_inputs = Variable(torch.LongTensor([1, 2, 3]).cuda())
else:
word_inputs = Variable(torch.LongTensor([1, 2, 3]))
encoder_outputs, encoder_hidden = encoder_test(word_inputs, encoder_hidden)
decoder_attns = torch.zeros(1, 3, 3)
decoder_hidden = encoder_hidden
decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size))
decoder_output, decoder_context, decoder_hidden, decoder_attn = decoder_test(word_inputs[0], decoder_context, decoder_hidden, encoder_outputs)
print(decoder_output)
print(decoder_hidden)
print(decoder_attn)
代码在CPU上可以正常运行。[out]:
EncoderRNN (
(embedding): Embedding(10, 10)
(gru): GRU(10, 10, num_layers=2)
)
AttnDecoderRNN (
(embedding): Embedding(10, 10)
(gru): GRU(20, 10, num_layers=2, dropout=0.1)
(out): Linear (20 -> 10)
(attn): Attn (
(attn): Linear (10 -> 10)
)
)
Variable containing:
-2.4378 -2.3556 -2.3391 -2.5070 -2.3439 -2.3415 -2.3976 -2.1832 -1.9976 -2.2213
[torch.FloatTensor of size 1x10]
Variable containing:
(0 ,.,.) =
Columns 0 to 8
-0.2325 0.0775 0.5415 0.4876 -0.5771 -0.0687 0.1832 -0.5285 0.2508
Columns 9 to 9
-0.1837
(1 ,.,.) =
Columns 0 to 8
-0.1389 -0.2605 -0.0518 0.3405 0.0774 0.1815 0.0297 -0.1304 -0.1015
Columns 9 to 9
0.2602
[torch.FloatTensor of size 2x1x10]
Variable containing:
(0 ,.,.) =
0.3334 0.3291 0.3374
[torch.FloatTensor of size 1x1x3]
但当将标志更改为USE_GPU=True
时,初始化decoder_test
对象时会抛出错误,会抛出TypeError
:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-76-b3c660013934> in <module>()
12 decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size))
13
---> 14 decoder_output, decoder_context, decoder_hidden, decoder_attn = decoder_test(word_inputs[0], decoder_context, decoder_hidden, encoder_outputs)
15 print(decoder_output)
16 print(decoder_hidden)
~/.local/lib/python3.5/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
222 for hook in self._forward_pre_hooks.values():
223 hook(self, input)
--> 224 result = self.forward(*input, **kwargs)
225 for hook in self._forward_hooks.values():
226 hook_result = hook(self, input, result)
<ipython-input-75-34ecfe9b3112> in forward(self, word_input, last_context, last_hidden, encoder_outputs)
32
33 # Combine embedded input word and last context, run through RNN
---> 34 rnn_input = torch.cat((word_embedded, last_context.unsqueeze(0)), 2)
35 rnn_output, hidden = self.gru(rnn_input, last_hidden)
36
~/.local/lib/python3.5/site-packages/torch/autograd/variable.py in cat(iterable, dim)
895 @staticmethod
896 def cat(iterable, dim=0):
--> 897 return Concat.apply(dim, *iterable)
898
899 @staticmethod
~/.local/lib/python3.5/site-packages/torch/autograd/_functions/tensor.py in forward(ctx, dim, *inputs)
315 ctx.dim = dim
316 ctx.input_sizes = [i.size(dim) for i in inputs]
--> 317 return torch.cat(inputs, dim)
318
319 @staticmethod
TypeError: cat received an invalid combination of arguments - got (tuple, int), but expected one of:
* (sequence[torch.cuda.FloatTensor] seq)
* (sequence[torch.cuda.FloatTensor] seq, int dim)
didn't match because some of the arguments have invalid types: (tuple, int)
问题是为什么在CUDA中这些类型不匹配,但在CPU上可以工作,并且如何解决这个问题?
PyTorch是否有一个全局标志,可以将所有类型更改为CUDA类型,而不会干扰CPU/GPU类型?
encoder_hidden
和decoder_context
没有.cuda()
。 - alvas