Goal:
- Identify a category label from a question entered by the user (as in a question-answering system).
- Extract data from a large PDF file and predict the page number based on the user's input.
- This is mainly for policy documents: users have questions about a policy and need to be shown the specific page number.
Previous implementation: I applied Elasticsearch, but precision was very low, because users can type arbitrary text, e.g. "I need" = "want".
Dataset information: Each row of the dataset contains a text (or paragraph) and a label (the page number). The dataset is small; I have only 500 rows.
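For reference, here is a minimal sketch of the dataset layout and of what get_data is assumed to return (the CSV path, column order, and this helper body are only illustrative; the post only states that each row is a text plus a page-number label):
import csv
def get_data(csv_path):
    # each row of the CSV: column 0 = paragraph text, column 1 = page number (class label)
    texts, labels, labels_index = [], [], {}
    with open(csv_path, encoding='utf8') as f:
        for text, page in csv.reader(f):
            if page not in labels_index:
                labels_index[page] = len(labels_index)  # map page number to a numeric class id
            texts.append(text)
            labels.append(labels_index[page])
    return texts, labels, labels_index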
Current implementation:
- Applied word embeddings (GloVe) with an LSTM in Keras, with TensorFlow as the backend
- Applied Dropout
- Applied ActivityRegularization
- Applied an L2 W_regularizer (from 0.1 to 0.001)
- Tried different values of nb_epoch, from 10 to 600
- Changed EMBEDDING_DIM for the GloVe data from 100 to 300
NLP preprocessing applied (a sketch of these steps follows this list):
- Convert to lowercase
- Remove English stop words
- Stemming
- Remove numbers
- Remove URLs and IP addresses
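A minimal sketch of the preprocessing steps listed above (using NLTK and regular expressions here is an assumption; the post does not say which library was actually used):
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
STOPWORDS = set(stopwords.words('english'))
STEMMER = PorterStemmer()
def preprocess(text):
    text = text.lower()                                        # convert to lowercase
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)         # remove URLs
    text = re.sub(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', ' ', text)   # remove IP addresses
    text = re.sub(r'\d+', ' ', text)                           # remove numbers
    tokens = [w for w in re.findall(r'[a-z]+', text) if w not in STOPWORDS]  # drop stop words
    return ' '.join(STEMMER.stem(w) for w in tokens)           # stem the remaining words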
Result: Accuracy on the test (or validation) data is 23%, while on the training data it is 91%.
Code:
import time
from time import strftime
import numpy as np
from keras.callbacks import CSVLogger, ModelCheckpoint
from keras.layers import Dense, Input, LSTM, ActivityRegularization
from keras.layers import Embedding, Dropout,Bidirectional
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.regularizers import l2
from keras.utils import to_categorical
import os
import pickle
from DataGenerator import *
BASE_DIR = ''
GLOVE_DIR = 'D:/Dataset/glove.6B' # BASE_DIR + '/glove.6B/'
MAX_SEQUENCE_LENGTH = 50
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.2
# first, build index mapping words in the embeddings set
# to their embedding vector
np.random.seed(1337) # for reproducibility
print('Indexing word vectors.')
t_start = time.time()
embeddings_index = {}
if os.path.exists('pickle/glove.pickle'):
    print('Pickle found..')
    with open('pickle/glove.pickle', 'rb') as handle:
        embeddings_index = pickle.load(handle)
else:
    print('Pickle not found...')
    f = open(os.path.join(GLOVE_DIR, 'glove.6B.300d.txt'), encoding='utf8')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    with open('pickle/glove.pickle', 'wb') as handle:
        pickle.dump(embeddings_index, handle, protocol=pickle.HIGHEST_PROTOCOL)
print('Found %s word vectors.' % len(embeddings_index))
# second, prepare text samples and their labels
print('Processing text dataset')
texts = [] # list of text samples
labels = [] # list of label ids
labels_index = {} # dictionary mapping label name to numeric id
(texts, labels, labels_index) = get_data('D:/PolicyDocument/')
print('Found %s texts.' % len(texts))
# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]
# prepare embedding matrix
num_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words + 1, EMBEDDING_DIM))
print('Preparing embedding matrix. :', embedding_matrix.shape)
for word, i in word_index.items():
    if i > num_words:
        continue  # skip words beyond the MAX_NB_WORDS vocabulary cap
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in the embedding index will be all-zeros
        embedding_matrix[i] = embedding_vector
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(embedding_matrix.shape[0],
                            embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            mask_zero=True,
                            trainable=False)
print('Training model.')
csv_file = "logs/training_log_" + strftime("%Y-%m-%d %H-%M", time.localtime()) + ".csv"
model_file = "models/Model_" + strftime("%Y-%m-%d %H-%M", time.localtime()) + ".mdl"
print("Model file:" + model_file)
csv_logger = CSVLogger(csv_file)
# train a two-layer LSTM classifier on top of the frozen embeddings
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
# note: these randomly drawn hyperparameters are computed but never used below;
# the LSTM sizes and dropout rates are hard-coded in the layers that follow
rate_drop_lstm = 0.15 + np.random.rand() * 0.25
num_lstm = np.random.randint(175, 275)
rate_drop_dense = 0.15 + np.random.rand() * 0.25
x = LSTM(num_lstm, return_sequences=True, W_regularizer=l2(0.001))(embedded_sequences)
x = Dropout(0.5)(x)
x = LSTM(64)(x)
x = Dropout(0.25)(x)
x = ActivityRegularization(l1=0.01, l2=0.001)(x)
preds = Dense(len(labels_index), activation='softmax')(x)
model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])
model_checkpoint = ModelCheckpoint(model_file, monitor='val_loss', verbose=0, save_best_only=True,
                                   save_weights_only=False, mode='auto')
model.fit(x_train, y_train,
          batch_size=1,
          nb_epoch=600,
          validation_data=(x_val, y_val),
          callbacks=[csv_logger, model_checkpoint])
score = model.evaluate(x_val, y_val, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])
t_end = time.time()
total = t_end - t_start
ret_str = "Time needed(s): " + str(total)
print(ret_str)