LSTM implementation / overfitting

I ran into a problem with my LSTM implementation, and I'm not sure whether my implementation is wrong or it's simply overfitting. I'm using an LSTM to grade essays, scoring each text on a 0-10 scale (or another range). I use the ASAP Kaggle competition data as one of the training datasets.
However, the main goal is good performance on a private dataset of roughly 500 samples, which cover both the training and validation sets. I ran some experiments earlier and got the model working, but after tweaking a few things the model stopped working: it no longer improves at all. I also reimplemented the code in a more object-oriented style, but I still cannot reproduce the earlier results.
That said, I can get the model to fit my data, only with severe overfitting. I'm not sure whether this is some kind of implementation problem or just overfitting, but I cannot get the model to work. With the LSTM I get at most 0.35 kappa on essay set 1 of the ASAP data. Surprisingly, a single fully connected layer reaches 0.75 kappa. I suspect it is an implementation problem, but I'm not sure.
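For reference, ASAP essay set 1 is ranked with quadratic weighted kappa, while sklearn's cohen_kappa_score is unweighted unless told otherwise, so the 0.35 and 0.75 figures should be computed with the same variant. A minimal sketch of both (the score arrays are made-up examples, not from the actual data):

import numpy as np
from sklearn.metrics import cohen_kappa_score

# Made-up integer essay scores, for illustration only
y_true = np.array([2, 8, 10, 6, 3, 7])
y_pred = np.array([3, 8, 9, 6, 2, 7])

# Unweighted kappa (sklearn's default): every disagreement counts the same
plain_kappa = cohen_kappa_score(y_true, y_pred)

# Quadratic weighted kappa: large disagreements are penalized more heavily;
# this is the metric used for the ASAP competition
qwk = cohen_kappa_score(y_true, y_pred, weights="quadratic")

print(f"unweighted kappa: {plain_kappa:.3f}  quadratic weighted kappa: {qwk:.3f}")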
Here is my old code:

train.py

import gensim
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import cohen_kappa_score
from torch import nn
import torch.utils.data as data_utils
from torch.optim import Adam

from dataset import AESDataset
from network import Network

from optimizer import Ranger
from qwk import quadratic_weighted_kappa, kappa

batch_size = 32

device = "cuda:0"
torch.manual_seed(1000)
# Load data from csv
file_name = "data/data_new.csv"
data = pd.read_csv(file_name)
arr = data.to_numpy()
text = arr[:, :2]
text = [str(line[0]) + str(line[1]) for line in text]
text = [gensim.utils.simple_preprocess(line) for line in text]

score = arr[:,2]

score = [sco*6 for sco in score]
score = np.asarray(score, dtype=int)


train_dataset = AESDataset(text_arr=text[:400], scores=score[:400])
test_dataset = AESDataset(text_arr=text[400:], scores=score[400:])

score = torch.tensor(score).view(-1,1).long().to(device)


train_loader = data_utils.DataLoader(train_dataset,shuffle=True, batch_size=batch_size, drop_last=True)
test_loader = data_utils.DataLoader(test_dataset,shuffle=True,batch_size=batch_size, drop_last=True)

out_class = 61

epochs = 1000

model = Network(out_class).to(device)
model.load_state_dict(torch.load("model/best_model"))
y_onehot = torch.FloatTensor(batch_size, out_class).to(device)
optimizer = Adam(model.parameters())
criti = torch.nn.CrossEntropyLoss()
# model, optimizer = amp.initialize(model, optimizer, opt_level="O2")


step = 0

for i in range(epochs):
    #Testing
    if i % 1 == 0:
        total_loss = 0
        total_kappa = 0
        total_batches = 0
        model.eval()
        for (text, score) in test_loader:

            out = model(text)
            out_score = torch.argmax(out, 1)
            y_onehot.zero_()
            y_onehot.scatter_(1, score, 1)
            kappa_l = cohen_kappa_score(score.view(batch_size).tolist(), out_score.view(batch_size).tolist())
            score = score.view(-1)
            loss = criti(out, score.view(-1))
            total_loss += loss
            total_kappa += kappa_l
            total_batches += 1
        print(f"Epoch {i} Testing kappa {total_kappa/total_batches} loss {total_loss/total_batches}")
        with open(f"model/epoch_{i}", "wb") as f:
            torch.save(model.state_dict(),f)
        model.train()
    #Training

    for (text, score) in train_loader:

        optimizer.zero_grad()
        step += 1
        out = model(text)
        out_score = torch.argmax(out,1)
        y_onehot.zero_()
        y_onehot.scatter_(1, score, 1)
        kappa_l = cohen_kappa_score(score.view(batch_size).tolist(),out_score.view(batch_size).tolist())
        loss = criti(out, score.view(-1))
        print(f"Epoch {i} step {step} kappa {kappa_l} loss {loss}")
        loss.backward()
        optimizer.step()

dataset.py

import gensim
import torch
import numpy as np

class AESDataset(torch.utils.data.Dataset):
    def __init__(self, text_arr, scores):
        self.data = text_arr
        self.scores = scores
        self.w2v_model = gensim.models.KeyedVectors.load("w2vec_model_all")  # load the saved word2vec vectors
        self.max_len = 500
    def __getitem__(self, item):
        vector = []
        essay = self.data[item]

        pad_vec = [1 for i in range(300)]
        for i in range(self.max_len - len(essay)):
            vector.append(pad_vec)
        for word in essay:
            word_vec = pad_vec
            try:
                word_vec = self.w2v_model[word]
            except:
                #print(f"Skipping word as word {word} not in dictionary")
                word_vec = pad_vec


            vector.append(word_vec)
        #print(len(vector))
        vector = np.stack(vector)
        tensor = torch.tensor(vector[:self.max_len]).float().to("cuda")
        score = self.scores[item]
        score = torch.tensor(score).long().to("cuda").view(1)

        return tensor, score

    def __len__(self):
        return len(self.scores)

network.py

import torch.nn as nn
import torch

import torch.nn.functional as F

class Network(nn.Module):
    def __init__(self, output_size):
        super(Network, self).__init__()
        self.lstm = nn.LSTM(300,500,1, batch_first=True)
        self.dropout = nn.Dropout(p=0.5)
        #self.l2 = nn.L2
        self.linear = nn.Linear(500,output_size)





    def forward(self,x):
        x, _ = self.lstm(x)
        x = x[:,-1,:]
        x = self.dropout(x)
        x = self.linear(x)


        return x

My new code: https://github.com/Clement-Hui/EssayGrading

1 Answer


I think the problem is in the training code: since you are using an LSTM, you should clear the hidden state and cell state after each epoch and detach them from the computation graph after each batch.

network.py

import torch.nn as nn
import torch

import torch.nn.functional as F

class Network(nn.Module):
    def __init__(self, output_size):
        super(Network, self).__init__()
        self.lstm = nn.LSTM(300,500,1, batch_first=True)
        self.dropout = nn.Dropout(p=0.5)
        #self.l2 = nn.L2
        self.linear = nn.Linear(500,output_size)

    def forward(self,x,hidden):
        x, hidden = self.lstm(x,hidden)
        x = x[:, -1, :]  # keep only the last timestep so the output shape is (batch, classes)
        x = self.dropout(x)
        x = self.linear(x)
        return x , hidden

    def init_hidden(self,batch_size):
        weights = next(self.parameters()).data
        hidden = (weights.new(1 , batch_size,500).zero_().cuda(),
                  weights.new(1 , batch_size,500).zero_().cuda())
        return hidden

train.py

# your code for initializing the model and data and all other stuff
for i in range(epochs):

    #Testing
    if i % 1 == 0:
        total_loss = 0
        total_kappa = 0
        total_batches = 0
        model.eval()
        val_h  = model.init_hidden(batch_size) # initialize the hidden state
        for (text, score) in test_loader:
           # Creating new variables for the hidden state, otherwise
           # we'd backprop through the entire training history
            val_h = tuple([each.data for each in val_h]) 
            out ,  val_h  = model(text,val_h)
            out_score = torch.argmax(out, 1)
            y_onehot.zero_()
            y_onehot.scatter_(1, score, 1)
            kappa_l = cohen_kappa_score(score.view(batch_size).tolist(), out_score.view(batch_size).tolist())
            score = score.view(-1)
            loss = criti(out, score.view(-1))
            total_loss += loss
            total_kappa += kappa_l
            total_batches += 1
        print(f"Epoch {i} Testing kappa {total_kappa/total_batches} loss {total_loss/total_batches}")
        with open(f"model/epoch_{i}", "wb") as f:
            torch.save(model.state_dict(),f)
    model.train()

    #Training
    h =  model.init_hidden(batch_size) # initialize the hidden state
    for (text, score) in train_loader:
        optimizer.zero_grad()
        step += 1
        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])
        out , h  = model(text,h)
        out_score = torch.argmax(out,1)
        y_onehot.zero_()
        y_onehot.scatter_(1, score, 1)
        kappa_l = cohen_kappa_score(score.view(batch_size).tolist(),out_score.view(batch_size).tolist())
        loss = criti(out, score.view(-1))
        print(f"Epoch {i} step {step} kappa {kappa_l} loss {loss}")
        loss.backward()
        optimizer.step()

Please let me know whether the changes mentioned above work.
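To spell out what the `tuple([each.data for each in h])` line does: it cuts the hidden and cell state loose from the previous batch's graph, so backward() only runs through the current batch. A standalone sketch of the same idiom using the more explicit detach() (the shapes and the dummy loss are illustrative only):

import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=300, hidden_size=500, num_layers=1, batch_first=True)
batch_size, seq_len = 4, 10

# Zero-initialized hidden and cell state, shape (num_layers, batch, hidden_size)
h = (torch.zeros(1, batch_size, 500),
     torch.zeros(1, batch_size, 500))

for step in range(3):
    x = torch.randn(batch_size, seq_len, 300)  # dummy batch of word vectors
    # Detach the carried-over state from the previous batch's graph;
    # without this, backward() would try to run through the whole history.
    h = tuple(s.detach() for s in h)
    out, h = lstm(x, h)
    loss = out[:, -1, :].sum()  # dummy loss on the last timestep's output
    loss.backward()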

Hi. But doesn't the LSTM initialize to zeros by default? Thanks. - Clement
I think my problem is caused by insufficient data. Could you answer here? Thanks: https://ai.stackexchange.com/questions/16520/how-to-train-lstm-score-prediction-with-very-little-data-bounty-to-be-added - Clement
Yes, the LSTM initializes the hidden state and cell state with zeros by default and clears them after each epoch, but I think the problem is that they are not being detached from the computation graph on each batch iteration. See this thread: https://discuss.pytorch.org/t/when-to-initialize-lstm-hidden-state/2323 - aman5319
