我正在使用 topicmodels 包中的 LDA,对约 30,000 个文档运行后得到了 30 个主题,并提取了每个主题的前 10 个单词,结果看起来非常好。但我想知道每个文档以最高概率属于哪个主题,该如何做?
# Topic-model the user bios: clean the text, build a document-term matrix,
# fit a 30-topic LDA model, and map every document to its most likely topic.
library(tm)
library(topicmodels)

# One document per bio; keep the Twitter ids so fitted documents can be
# matched back to users later.
myCorpus <- Corpus(VectorSource(userbios$bio))
docs <- userbios$twitter_id

# Functions that are not tm transformations (tolower, removeURL) are wrapped
# in content_transformer() so tm_map() keeps the corpus made of text
# documents instead of bare character strings.
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)
# Deliberately applied after punctuation/number removal: by then a URL has
# collapsed into a single alphabetic token starting with "http", which this
# pattern deletes as a whole.
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))
myStopwords <- c("twitter", "tweets", "tweet", "tweeting", "account")
# remove stopwords from corpus
myCorpus <- tm_map(myCorpus, removeWords, stopwords("english"))
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
# stem words
# require(rJava) # needed for stemming function
# library(Snowball) # also needed for stemming function
# a <- tm_map(myCorpus, stemDocument, language = "english")

# Raw term frequencies, keeping terms of two or more characters.
myDtm <- DocumentTermMatrix(
  myCorpus,
  control = list(wordLengths = c(2, Inf), weighting = weightTf)
)
myDtm2 <- removeSparseTerms(myDtm, sparse = 0.85)
dtm <- myDtm2

# LDA cannot handle documents with no remaining terms, so drop empty rows.
# The trailing comma matters: without it the matrix is not subset by row.
rowTotals <- apply(dtm, 1, sum)
keepRows <- rowTotals > 0
dtm2 <- dtm[keepRows, ]
# Filter the ids with the same mask so they stay aligned with dtm2's rows.
docs2 <- docs[keepRows]
dim(dtm2)

dtm_LDA <- LDA(dtm2, 30)

# Most likely topic per document, plus the full document-by-topic posterior
# probability matrix (one row per document in dtm2, one column per topic).
doc_topics <- topics(dtm_LDA)
doc_topic_probs <- posterior(dtm_LDA)$topics
doc_assignments <- data.frame(
  twitter_id = docs2,
  topic = doc_topics,
  probability = apply(doc_topic_probs, 1, max)
)