# Sample corpus: nine short document titles.
documents = [
    u'Human machine interface for lab abc computer applications',
    u'A survey of user opinion of computer system response time',
    u'The EPS user interface management system',
    u'System and human system engineering testing of EPS',
    u'Relation of user perceived response time to error measurement',
    u'The generation of random binary unordered trees',
    u'The intersection graph of paths in trees',
    u'Graph minors IV Widths of trees and well quasi ordering',
    u'Graph minors A survey',
]

from sklearn.feature_extraction.text import CountVectorizer
# compute vector space with sklearn (unigram counts only)
vect = CountVectorizer(min_df=1, ngram_range=(1, 1), max_features=25000)
# the result is a scipy sparse matrix holding the counts for all documents
corpus_vect = vect.fit_transform(documents)
# vocabulary_ maps each term to its integer id (see the sample output below).
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(vect.vocabulary_)
#{u'and': 1, u'minors': 20, u'generation': 9, u'testing': 32, u'iv': 15, u'engineering': 5, u'computer': 4, u'relation': 28, u'human': 11, u'measurement': 19, u'unordered': 37, u'binary': 3, u'abc': 0, u'for': 8, u'ordering': 23, u'graph': 10, u'system': 31, u'machine': 17, u'to': 35, u'quasi': 26, u'time': 34, u'random': 27, u'paths': 24, u'of': 21, u'trees': 36, u'applications': 2, u'management': 18, u'lab': 16, u'interface': 13, u'intersection': 14, u'response': 29, u'perceived': 25, u'in': 12, u'widths': 40, u'well': 39, u'eps': 6, u'survey': 30, u'error': 7, u'opinion': 22, u'the': 33, u'user': 38}

import gensim
# transform sparse matrix into gensim corpus
# (documents_columns=False: the documents are the rows of the matrix)
corpus_vect_gensim = gensim.matutils.Sparse2Corpus(corpus_vect, documents_columns=False)
# No id2word is passed here, so the topics printed below show integer term
# ids instead of the actual words.
lsi = gensim.models.LsiModel(corpus_vect_gensim, num_topics=4)
# I instead would like something like this line below
# lsi = gensim.models.LsiModel(corpus_vect_gensim, id2word=vect.vocabulary_, num_topics=2)
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(lsi.print_topics(2))
#['0.622*"21" + 0.359*"31" + 0.256*"38" + 0.206*"29" + 0.206*"34" + 0.197*"36" + 0.170*"33" + 0.168*"1" + 0.158*"10" + 0.147*"4"', '0.399*"36" + 0.364*"10" + -0.295*"31" + 0.245*"20" + -0.226*"38" + 0.194*"26" + 0.194*"15" + 0.194*"39" + 0.194*"23" + 0.194*"40"']


Gensim不需要Dictionary对象。 只要你的普通dict将id(整数)映射到单词(字符串),就可以直接将其用作id2word的输入。

实际上,任何类似字典的东西都可以使用(包括dict、Dictionary、SqliteDict……)。

(顺便说一句,gensim的Dictionary在底层是一个简单的Python dict。不确定您关于Dictionary性能的评论来自何处,但在Python中您无法比普通的dict更快地获得映射。也许您将其与文本预处理(不是gensim的一部分)混淆,这确实可能很慢。)

太快了,太容易了,谢谢!我往往对gensim的对象感到害怕……没想到我只需要转换一下scikit的dict。我真是太丢人了……关于时间/性能的评论:使用1k个文档在gensim中创建字典大约需要0.9秒,再加上一整秒将其转换为BoW和Tfidf。相比之下,scikit-learn的TfidfVectorizer只需1.2秒就能完成整个工作。 - emiguevara

# transform sparse matrix into gensim corpus (documents are the rows)
corpus_vect_gensim = gensim.matutils.Sparse2Corpus(corpus_vect, documents_columns=False)

# transform scikit vocabulary into gensim dictionary:
# scikit stores term -> id, while gensim's id2word expects id -> term,
# so invert the mapping in a single dict comprehension.
vocabulary_gensim = {term_id: term for term, term_id in vect.vocabulary_.items()}

或者更简单地写成:id2word = dict((v, k) for k, v in vect.vocabulary_.iteritems())(注:iteritems()仅适用于Python 2,在Python 3中请改用items())。 - Radim



from gensim.corpora.dictionary import Dictionary
# scikit's vocabulary_ is term -> id; gensim expects id -> term, so swap them.
dictionary = Dictionary.from_corpus(
    corpus_vect_gensim,
    id2word={term_id: term for term, term_id in vect.vocabulary_.items()},
)


我用你的“dictionary”替换了上一个答案中的“vocabulary_gensim”。现在用gensim.CoherenceModel()的c_v方法计算一致性是可行的。谢谢! - Tolga
很高兴能帮助到你 (: - Jeffrey04


下面是能够工作的Python 3代码解决方案。

import gensim
from gensim.corpora.dictionary import Dictionary
from sklearn.feature_extraction.text import CountVectorizer

def vect2gensim(vectorizer, dtmatrix):
    """Convert scikit-learn vectorizer output into a gensim corpus and dictionary.

    Args:
        vectorizer: Fitted vectorizer whose ``vocabulary_`` attribute maps
            each term to its integer id.
        dtmatrix: Sparse document-term matrix with the documents as rows.

    Returns:
        A ``(corpus, dictionary)`` tuple: the ``Sparse2Corpus`` wrapper around
        ``dtmatrix`` and the ``Dictionary`` built from that corpus.
    """
    # documents_columns=False: the rows of dtmatrix are the documents.
    corpus = gensim.matutils.Sparse2Corpus(dtmatrix, documents_columns=False)
    # gensim wants id -> term, the inverse of scikit's term -> id vocabulary.
    id_to_term = {term_id: term for term, term_id in vectorizer.vocabulary_.items()}
    return corpus, Dictionary.from_corpus(corpus, id2word=id_to_term)

# Sample corpus: nine short document titles.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Vectorize the documents with scikit-learn (unigram counts).
vect = CountVectorizer(
    min_df=1,
    ngram_range=(1, 1),
    max_features=25000,
)
corpus_vect = vect.fit_transform(documents)

# Hand the fitted vectorizer and its matrix over to gensim.
gensim_corpus, gensim_dict = vect2gensim(vect, corpus_vect)



直接使用vect.vocabulary_(键和值互换)在Python 3上不起作用,因为dict.keys()现在返回一个可迭代视图而不是列表。 相关的错误是:

TypeError: can only concatenate list (not "dict_keys") to list

要在Python 3上使其工作,请将lsimodel.py中的第301行更改为:

# list(...) is needed on Python 3, where keys() returns a view that cannot be
# concatenated to a list; the leading -1 yields num_terms == 0 for an empty id2word.
self.num_terms = 1 + max([-1] + list(self.id2word.keys()))




唯一的区别是使用Scikit Tokenizer和Stopwords

import logging
# Configure logging once for the script: each record shows timestamp, level
# name and message, and records at INFO level and above are emitted.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import gensim

from gensim import models

print("Text Similarity with Gensim and Scikit utils")
# Sample corpus: nine short document titles.
# The closing bracket below was missing, which made the whole script a SyntaxError.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Using Scikit learn feature extractor

from sklearn.feature_extraction.text import CountVectorizer
# Unigram counts; stop_words='english' makes scikit-learn drop English stop words.
vect = CountVectorizer(min_df=1, ngram_range=(1, 1), stop_words='english')
corpus_vect = vect.fit_transform(documents)
# take the dict keys out (the vocabulary terms as a plain list)
texts = list(vect.vocabulary_.keys())

# transform sparse matrix into gensim corpus (documents are the rows)
corpus_vect_gensim = gensim.matutils.Sparse2Corpus(corpus_vect, documents_columns=False)

from gensim import corpora
# Build the gensim dictionary from scikit's own term -> id mapping (inverted
# to id -> term), so the dictionary ids are exactly the ids used in the
# corpus rather than relying on gensim assigning matching ids by itself.
dictionary = corpora.Dictionary.from_corpus(
    corpus_vect_gensim,
    id2word={term_id: term for term, term_id in vect.vocabulary_.items()},
)

# create LSI model
lsi = models.LsiModel(corpus_vect_gensim, id2word=dictionary, num_topics=2)

# Turn the query into a bag-of-words vector, then project it into LSI space.
doc = "Human computer interaction"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]

# Index the LSI-transformed corpus so it can be queried for similarity.
from gensim import similarities
index = similarities.MatrixSimilarity(lsi[corpus_vect_gensim])

# Similarity of the query against every document, in corpus order.
sims = index[vec_lsi]
print(list(enumerate(sims)))  # (document_number, document_similarity) 2-tuples

# Rank the documents from most to least similar and show each with its score.
sims = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)
for doc_position, doc_score in sims:
    print(doc_score, documents[doc_position])

