library(mallet)
library(wordcloud)
# Insert path to the directory where this script is kept.
# Relative paths used by this script (e.g. "stopwords.txt") are resolved
# against this working directory.
setwd("~/Desktop/topic_model")
# Number of topics the model is trained with.
n.topics <- 20
# Insert path to the modified_texts directory.
texts.dir <- "~/Desktop/topic_model/modified_texts"
documents <- mallet.read.dir(texts.dir)
# Import the documents, using stopwords.txt as the stop list. The token pattern
# is: a letter, then one or more letters or punctuation marks, then a letter,
# so a token is at least three characters long and starts and ends with a letter.
mallet.instances <- mallet.import(documents$id, documents$text, "stopwords.txt",
                                  token.regexp = "\\p{L}[\\p{L}\\p{P}]+\\p{L}")
# Create a topic trainer object.
topic.model <- MalletLDA(num.topics = n.topics)
topic.model$loadDocuments(mallet.instances)
# NOTE(review): presumably "optimise hyperparameters every 20 iterations after
# 50 burn-in iterations", as in the mallet package example -- confirm.
topic.model$setAlphaOptimization(20, 50)
# Train a model.
# We can specify the number of iterations. Here we'll use a large-ish round number.
topic.model$train(1000)
# Run through a few iterations where we pick the best topic for each token,
# rather than sampling from the posterior distribution.
topic.model$maximize(50)
# Basic variables
doc.topics <- mallet.doc.topics(topic.model, smoothed = TRUE, normalized = TRUE)
topic.words <- mallet.topic.words(topic.model, smoothed = TRUE, normalized = TRUE)
# Label each topic with its top three words.
topic.labels <- mallet.topic.labels(topic.model, topic.words, 3)
# Note: this variable shares its name with the mallet function it calls.
mallet.word.freqs <- mallet.word.freqs(topic.model)
# Doc id vector: each document's file name without its directory and without a
# trailing ".txt". basename() avoids depending on the exact directory prefix,
# and the escaped, anchored pattern removes only a literal ".txt" suffix
# (an unescaped ".txt" would also match e.g. "atxt" in the middle of a name).
doc.ids <- sub("\\.txt$", "", basename(as.character(documents$id)))
# Identify the documents that are most representative of a given topic.
#
# Arguments:
#   x    Index of the topic; used to select topic.labels[x] and doc.topics[, x].
#   num  Maximum number of rows (documents) to return.
#
# Returns a data frame with one row per document and four columns: the topic
# index, the topic label, 100 * doc.topics[, x] rounded to two decimals, and the
# document id. Rows are sorted by the third column in decreasing order and only
# the first `num` rows are kept.
#
# Reads the script-level variables topic.labels, doc.topics and doc.ids.
top.docs <- function(x, num){
  # The data.frame() call is left as-is so the auto-generated column names
  # stay the same for callers that rbind() the results.
  df <- data.frame(x, topic.labels[x], round(100*(doc.topics[, x]), digits=2), doc.ids)
  # Sort on the column vector (df[[3]]), not on a one-column data frame (df[3]):
  # order() on a data frame goes through xtfrm() and warns on recent R versions.
  docs <- df[order(df[[3]], decreasing = TRUE), ]
  head(docs, num)
}
# Create file containing a table of the top 50 articles for each topic.
# Topics are enumerated with n.topics (the value the model was created with)
# instead of a hard-coded 1:20, so changing the topic count needs one edit only.
topic.data <- data.frame()
for (x in seq_len(n.topics)) {
  topic.data <- rbind(topic.data, top.docs(x, 50))
}
colnames(topic.data) <- c("topic number", "top three words", "percentage", "document")
write.csv(topic.data, file = "top_docs.csv")
# Make wordcloud PNG files: one file per topic, named topic_<index>.png,
# drawn from that topic's 50 top words.
for (x in seq_len(n.topics)) {
  top.words <- mallet.top.words(topic.model, topic.words[x, ], 50)
  png(paste0("topic_", x, ".png"), width = 1200, height = 1200, units = "px", res = 300)
  # Close the PNG device even if wordcloud() fails, so an error does not
  # leave a graphics device open.
  tryCatch(
    wordcloud(top.words[, 1], top.words[, 2], scale = c(3, .5), max.words = Inf,
              min.freq = 3, random.order = FALSE, rot.per = 0,
              use.r.layout = FALSE, fixed.asp = TRUE),
    finally = dev.off()
  )
}
