使用Gensim或其他Python LDA包来使用经过训练的Mallet LDA模型。

5
我有一个通过Java中的Mallet训练的LDA模型。从Mallet LDA模型生成了三个文件,这些文件允许我从文件运行模型并推断新文本的主题分布。
现在我想实现一个Python工具,可以根据训练好的LDA模型给出一个新文本的主题分布。我不想在Python中重新训练LDA模型。因此,我想知道是否可以将训练好的Mallet LDA模型加载到Gensim或其他任何Python LDA包中。如果可以,我该如何操作?
感谢任何答案或评论。
1个回答

1
简而言之,是的,你可以!使用Mallet的好处在于,一旦运行完成,你不必再去重新标记主题。我正在做类似的事情-我将在下面发布我的代码,并提供一些有用的链接。一旦您的模型训练完成,请保存笔记本小部件状态,您就可以在新的和不同的数据集上运行您的模型,同时保持相同的主题分配。此代码包括测试和验证集。确保您已下载Mallet和Java,然后尝试以下操作:

# future bridges python 2 and 3
from __future__ import print_function

# pandas works with data structures, data manipulation, and analysis specifically for numerical tables, and series like 
# the csv we are using here today
import pandas as pd

from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

# Gensim unsupervised topic modeling, natural language processing, statistical machine learning
import gensim
# convert a document to a list of tokens
from gensim.utils import simple_preprocess
# remove stopwords - words that are not telling: "it" "I" "the" "and" etc.
from gensim.parsing.preprocessing import STOPWORDS
# corpus iterator 
from gensim import corpora, models

# nltk - Natural Language Toolkit
# lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed 
# into present.
# stemmed — words are reduced to their root form.
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

# NumPy - multidimensional arrays, matrices, and high-level mathematical formulas
import numpy as np
np.random.seed(2018)

import os
from gensim.models.wrappers import LdaMallet
from pathlib import Path
import codecs
import logging

import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

# Silence DeprecationWarning messages for the rest of the session.
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# NOTE(review): logging.basicConfig() does nothing once the root logger already has
# handlers, so after the call above this second call presumably leaves the level at
# ERROR instead of switching it to INFO - confirm which level is actually intended.
logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO)

# Load the CSV and keep only the text column, plus an explicit row-number column.
data = pd.read_csv('YourData.csv', encoding="ISO-8859-1")
# .copy() makes data_text an independent frame, so adding the 'index' column below
# does not write into a slice of `data` (which raises SettingWithCopyWarning).
data_text = data[['Preprocessed Document or your comments column title']].copy()
data_text['index'] = data_text.index
documents = data_text

# Create functions to lemmatize stem, and preprocess

# turn beautiful, beautifully, beautified into the stem beauti
def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb with WordNet, then Porter-stem the result.

    Args:
        text: a single token (string).

    Returns:
        The stem of the verb-lemmatized token.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return PorterStemmer().stem(verb_lemma)

# parse docs into individual words, ignoring words that are 3 letters long or shorter
# and stopwords: him, her, them, for, there, etc., since "their" is not a topic.
# then append the tokens to a list
def preprocess(text):
    """Tokenize ``text`` and return the lemmatized, stemmed tokens worth keeping.

    A token is kept only when it is longer than 3 characters and is neither a
    gensim stopword nor one of the custom stopwords listed below.

    Args:
        text: raw document string.

    Returns:
        list: the processed tokens, in document order.
    """
    # Custom stopwords, built once per call instead of once per token.
    new_stop_words = {'yourStopWord1', 'yourStopWord2'}
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    # The original also called nltk.bigrams(token) and threw the result away,
    # so it contributed nothing to the output; the call has been removed.
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if token not in stop_words
        and token not in new_stop_words
        and len(token) > 3
    ]

# gensim.parsing.preprocessing.STOPWORDS

# look at a random row 4310 and see if things worked out
# note that the document created was already preprocessed

# Take the text of the row whose 'index' value is 4310
# (first column of the first matching row).
doc_sample = documents[documents['index'] == 4310].values[0][0]
print('original document: ')
# split(' ') already returns the list of words, so no element-by-element copy is needed.
words = doc_sample.split(' ')
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

# let’s look at ten rows passed through the lemmatize stemming and preprocess

# Drop rows that have no text, then run every remaining document through preprocess().
text_column = 'Preprocessed Document'
documents = documents.dropna(subset=[text_column])
processed_docs = documents[text_column].map(preprocess)
# Notebook preview of the first ten processed documents.
processed_docs.head(10)

# we create a dictionary of all the words in the csv by iterating through
# contains the number of times a word appears in the training set.

# Build the "valid" dictionary from the processed documents at position 20000 onward,
# then preview its first 31 entries.
dictionary_valid = gensim.corpora.Dictionary(processed_docs[20000:])
for shown, (key, value) in enumerate(dictionary_valid.iteritems(), start=1):
    print(key, value)
    if shown > 30:
        break

# Build the "test" dictionary from the first 20000 processed documents,
# then preview its first 31 entries the same way.
dictionary_test = gensim.corpora.Dictionary(processed_docs[:20000])
for shown, (key, value) in enumerate(dictionary_test.iteritems(), start=1):
    print(key, value)
    if shown > 30:
        break
        
# we want to throw out words that are so frequent that they tell us little about the topic
# as well as words that are too infrequent (<15 rows), then keep just 100,000 words

# Prune both vocabularies with the same thresholds: drop words that are too rare
# (no_below=15) or too common (no_above=0.5), then cap the size at keep_n=100000.
for vocabulary in (dictionary_valid, dictionary_test):
    vocabulary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# the words become numbers and are then counted for frequency
# consider a random row 4310 - it has 8 words word indexed 2 shows up once
# preview the bag of words

# Convert every processed document to a bag of words using the "valid" dictionary,
# then preview document 4310.
bow_corpus_valid = list(map(dictionary_valid.doc2bow, processed_docs))
bow_corpus_valid[4310]

# Same conversion using the "test" dictionary, with the same preview.
bow_corpus_test = list(map(dictionary_test.doc2bow, processed_docs))
bow_corpus_test[4310]

# same thing in more words

# Spell out document 4310's bag of words: each entry is a (word id, count) pair,
# and the dictionary turns the id back into the word itself.
bow_doc_4310 = bow_corpus_test[4310]
for word_id, occurrences in bow_doc_4310:
    print("Word {} (\"{}\") appears {} time.".format(word_id,
                                               dictionary_test[word_id],
                                               occurrences))

# Location of the Mallet launcher script; both models below use the same path.
mallet_path = 'C:/mallet/mallet-2.0.8/bin/mallet.bat'

# Train a 20-topic Mallet LDA model on the "test" corpus and dump its topics.
ldamallet_test = gensim.models.wrappers.LdaMallet(
    mallet_path,
    corpus=bow_corpus_test,
    num_topics=20,
    id2word=dictionary_test,
)
result = ldamallet_test.show_topics(num_topics=20, num_words=10, formatted=False)
for topic_entry in result:
    print(topic_entry)

# Train a second 20-topic model on the "valid" corpus and dump its topics.
ldamallet_valid = gensim.models.wrappers.LdaMallet(
    mallet_path,
    corpus=bow_corpus_valid,
    num_topics=20,
    id2word=dictionary_valid,
)
result = ldamallet_valid.show_topics(num_topics=20, num_words=10, formatted=False)
for topic_entry in result:
    print(topic_entry)
    
# Show Topics
for idx, topic in ldamallet_test.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

# Show Topics
for idx, topic in ldamallet_valid.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

# check out the topics - 30 words - all topics
# -1 asks for every topic, matching the loops above. The original passed the
# leftover loop variable idx (its last value, presumably 19 for a 20-topic
# model), which requested one topic fewer than the model has.
ldamallet_valid.print_topics(-1, 30)

# check out the topics - 30 words - all topics
ldamallet_test.print_topics(-1, 30)

# Compute Coherence Score
# c_v coherence of the "valid" model, measured against the processed documents.
coherence_model_ldamallet_valid = CoherenceModel(
    model=ldamallet_valid,
    texts=processed_docs,
    dictionary=dictionary_valid,
    coherence='c_v',
)
coherence_ldamallet_valid = coherence_model_ldamallet_valid.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet_valid)

# c_v coherence of the "test" model, measured the same way.
coherence_model_ldamallet_test = CoherenceModel(
    model=ldamallet_test,
    texts=processed_docs,
    dictionary=dictionary_test,
    coherence='c_v',
)
coherence_ldamallet_test = coherence_model_ldamallet_test.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet_test)

看看这个:https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/ 这个也有用:https://rare-technologies.com/tutorial-on-mallet-in-python/ 还有这个:https://radimrehurek.com/gensim/models/wrappers/ldamallet.html

希望这有所帮助,祝你好运 :)


网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接