The following derives keywords from corpus using LDA (Latent Dirichlet Allocation). A detailed overview and explanation of LDA can be found here.

from gensim.models import LdaModel, HdpModel
from gensim import corpora
We first map the words in corpus to indexes, and then use this mapping to create a bag-of-words representation in which each token in corpus is replaced by its index. This is done via:

dirichlet_dict = corpora.Dictionary(corpus)
bow_corpus = [dirichlet_dict.doc2bow(text) for text in corpus]
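As a small illustration (with hypothetical tokens; the exact ids may differ), doc2bow maps each token to an (id, count) pair:

sample_dict = corpora.Dictionary([["apple", "banana", "apple"]])
print(sample_dict.doc2bow(["apple", "banana", "apple"]))  # [(0, 2), (1, 1)] -- (token id, in-document count)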
For the LDA model, the optimal number of topics needs to be determined first, which can be done heuristically through the method in this answer. Let's assume that our optimal number of topics is 10, and, as the question requires, that we want 300 keywords:
num_topics = 10
num_keywords = 300

dirichlet_model = LdaModel(corpus=bow_corpus,
                           id2word=dirichlet_dict,
                           num_topics=num_topics,
                           update_every=1,
                           chunksize=len(bow_corpus),
                           passes=20,
                           alpha='auto')
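As a sketch of one common heuristic (not necessarily the exact method in the linked answer), gensim's CoherenceModel can score a range of topic counts and keep the best:

from gensim.models import CoherenceModel

coherence_per_k = []
for k in range(5, 21):  # candidate topic counts
    candidate = LdaModel(corpus=bow_corpus, id2word=dirichlet_dict,
                         num_topics=k, passes=20, alpha='auto')
    cm = CoherenceModel(model=candidate, texts=corpus,
                        dictionary=dirichlet_dict, coherence='c_v')
    coherence_per_k.append((k, cm.get_coherence()))
best_k = max(coherence_per_k, key=lambda kc: kc[1])[0]  # count with the highest coherence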
Next is a function that determines the best topics based on their average coherence across the corpus. First, an ordered list of each topic's most important words is generated; then the average coherence of each topic over the whole corpus is found; finally, the topics are ordered by this average coherence and returned along with the list of averages for later use. The full code follows (including the option to use HDP, shown further below):
def order_subset_by_coherence(dirichlet_model, bow_corpus, num_topics=10, num_keywords=10):
    """
    Orders topics based on their average coherence across the corpus

    Parameters
    ----------
    dirichlet_model : gensim.models.LdaModel or gensim.models.HdpModel
    bow_corpus : list of lists (contains (id, freq) tuples)
    num_topics : int (default=10)
    num_keywords : int (default=10)

    Returns
    -------
    ordered_topics, ordered_topic_averages: list of lists and list
    """
    if isinstance(dirichlet_model, LdaModel):
        shown_topics = dirichlet_model.show_topics(num_topics=num_topics,
                                                   num_words=num_keywords,
                                                   formatted=False)
    elif isinstance(dirichlet_model, HdpModel):
        shown_topics = dirichlet_model.show_topics(num_topics=150,  # return all topics
                                                   num_words=num_keywords,
                                                   formatted=False)
    model_topics = [[word[0] for word in topic[1]] for topic in shown_topics]

    topic_corpus = dirichlet_model.__getitem__(bow=bow_corpus, eps=0)  # cutoff probability to 0

    topics_per_response = [response for response in topic_corpus]
    flat_topic_coherences = [item for sublist in topics_per_response for item in sublist]

    significant_topics = list(set([t_c[0] for t_c in flat_topic_coherences]))  # those that appear
    topic_averages = [sum([t_c[1] for t_c in flat_topic_coherences if t_c[0] == topic_num]) / len(bow_corpus)
                      for topic_num in significant_topics]

    topic_indexes_by_avg_coherence = [tup[0] for tup in sorted(enumerate(topic_averages), key=lambda i: i[1])[::-1]]

    significant_topics_by_avg_coherence = [significant_topics[i] for i in topic_indexes_by_avg_coherence]
    ordered_topics = [model_topics[i] for i in significant_topics_by_avg_coherence][:num_topics]  # limit for HDP

    ordered_topic_averages = [topic_averages[i] for i in topic_indexes_by_avg_coherence][:num_topics]  # limit for HDP
    ordered_topic_averages = [a / sum(ordered_topic_averages) for a in ordered_topic_averages]  # normalize HDP values

    return ordered_topics, ordered_topic_averages
ordered_topics, ordered_topic_averages = \
    order_subset_by_coherence(dirichlet_model=dirichlet_model,
                              bow_corpus=bow_corpus,
                              num_topics=num_topics,
                              num_keywords=num_keywords)

ignore_words = []  # fill with words that should not appear in the results
keywords = []
for i in range(num_topics):
    # Find the number of indexes to select, which can later be extended if the word has already been selected
    selection_indexes = list(range(int(round(num_keywords * ordered_topic_averages[i]))))
    if selection_indexes == [] and len(keywords) < num_keywords:
        # Fix potential rounding error by giving this topic one selection
        selection_indexes = [0]

    for s_i in selection_indexes:
        if ordered_topics[i][s_i] not in keywords and ordered_topics[i][s_i] not in ignore_words:
            keywords.append(ordered_topics[i][s_i])
        else:
            selection_indexes.append(selection_indexes[-1] + 1)

# Fix for if too many were selected
keywords = keywords[:num_keywords]
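To make the proportional selection concrete, here is an illustrative calculation with hypothetical normalized topic averages:

example_averages = [0.30, 0.25, 0.15, 0.10, 0.08, 0.05, 0.03, 0.02, 0.01, 0.01]  # hypothetical values
allocations = [int(round(300 * avg)) for avg in example_averages]
print(allocations)  # [90, 75, 45, 30, 24, 15, 9, 6, 3, 3] -- sums to 300 keywords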
The keyword-selection code above also uses the variable ignore_words, a list of words that should not be included in the results.

An HdpModel can be used in place of the LdaModel; the sorting and subsetting done by order_subset_by_coherence then ensure that the best topics are used for the limited selection. The model can be created via:

dirichlet_model = HdpModel(corpus=bow_corpus,
                           id2word=dirichlet_dict,
                           chunksize=len(bow_corpus))
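From there the rest of the pipeline is unchanged; a minimal sketch of reusing the ordering call with the HDP model (order_subset_by_coherence detects the model type internally):

ordered_topics, ordered_topic_averages = \
    order_subset_by_coherence(dirichlet_model=dirichlet_model,  # now an HdpModel
                              bow_corpus=bow_corpus,
                              num_topics=num_topics,
                              num_keywords=num_keywords)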
import os

files = os.listdir()
topWords = ["word1", "word2.... etc"]
wordsCount = 0
for file in files:
    with open(file, "r") as file_opened:
        text = file_opened.read()
    for word in topWords:
        # Match whole tokens in the file, up to the 300-word limit
        if word in text.split() and wordsCount < 300:
            print("I found %s" % word)
            wordsCount += 1
    # Check wordsCount again to stop the outer loop once 300 words are found
    if wordsCount == 300:
        break
import os
import operator
from collections import defaultdict

files = os.listdir()
words = defaultdict(int)
for file in files:
    with open(file, "r") as open_file:
        for line in open_file.readlines():
            for word in line.split():
                words[word] += 1

# Sort by frequency, most frequent first
sorted_words = sorted(words.items(), key=operator.itemgetter(1), reverse=True)
Now take the first 300 of the sorted words; these are the words you want.
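For example, with sorted_words in descending order of frequency:

top_words = [word for word, count in sorted_words[:300]]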
Applying tf-idf is the simplest and most effective way to find the most important words. If you have stop words, you can filter them out before applying this code. Hope this helps.
import java.util.List;

/**
 * Class to calculate the tf-idf of a term.
 * @author Mubin Shrestha
 */
public class TfIdf {

    /**
     * Calculates the tf of the term termToCheck.
     * @param totalterms : array of all the words in the document under processing
     * @param termToCheck : term for which tf is to be calculated
     * @return tf (term frequency) of the term termToCheck
     */
    public double tfCalculator(String[] totalterms, String termToCheck) {
        double count = 0;  // counts the overall occurrences of termToCheck
        for (String s : totalterms) {
            if (s.equalsIgnoreCase(termToCheck)) {
                count++;
            }
        }
        return count / totalterms.length;
    }

    /**
     * Calculates the idf of the term termToCheck.
     * @param allTerms : the terms of all the documents, one String[] per document
     * @param termToCheck : term for which idf is to be calculated
     * @return idf (inverse document frequency) score
     */
    public double idfCalculator(List<String[]> allTerms, String termToCheck) {
        double count = 0;  // number of documents containing termToCheck
        for (String[] ss : allTerms) {
            for (String s : ss) {
                if (s.equalsIgnoreCase(termToCheck)) {
                    count++;
                    break;
                }
            }
        }
        return 1 + Math.log(allTerms.size() / count);
    }
}
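For comparison, here is a hedged Python sketch of the same idea using scikit-learn's TfidfVectorizer (the document list and the choice of summing scores across documents are assumptions for illustration):

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["text of document one", "text of document two"]  # hypothetical corpus
vectorizer = TfidfVectorizer(stop_words="english")  # filters English stop words
tfidf = vectorizer.fit_transform(docs)

# Sum each term's tf-idf score across documents and take the top 300
scores = tfidf.sum(axis=0).A1
terms = vectorizer.get_feature_names_out()
top_300 = sorted(zip(terms, scores), key=lambda ts: ts[1], reverse=True)[:300]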