# Clustering
# Load the Barack Obama Facebook posts.
# The working directory is switched to ../Data, so the relative file names
# used by read.csv() in this script resolve against that folder.
setwd("../Data")
obama <- read.csv(file = "BarackObamaFacebook.csv")

# Quick inspection: table size, column names, first rows, first messages.
dim(obama)
names(obama)
head(x = obama)
head(x = obama$message)

# Hold the post text as plain character strings for the text tools below.
obama$message <- as.character(x = obama$message)

# Clustering with quanteda
# Install quanteda only when it is missing, instead of reinstalling it on
# every run of the script.
if (!requireNamespace("quanteda", quietly = TRUE)) {
  install.packages("quanteda")
}
library(quanteda)

# One document per Facebook post; the full data frame is attached as docvars.
corpus <- corpus(obama$message,
                 docvars = obama)

# Build the document-feature matrix.
# Tokenize explicitly (dropping punctuation) and call dfm() on the tokens:
# handing a corpus plus stem/remove_punct arguments straight to dfm() is
# deprecated in quanteda 3 and not accepted by later releases.
post.tokens <- tokens(corpus, remove_punct = TRUE)
doc.features <- dfm(post.tokens)

# Drop stopwords BEFORE stemming. The stopword list holds unstemmed words, so
# stemming first would turn e.g. "because" into "becaus" and "very" into
# "veri", letting them slip past the stopword filter.
doc.features <- dfm_remove(doc.features,
                           stopwords("english"))
doc.features <- dfm_wordstem(doc.features)

# Most frequent remaining features.
topfeatures(doc.features, 20)
# k-means uses random starting points, so fix the seed to get the same
# clusters on the next run.
set.seed(1234)

# Cluster the posts into 10 groups.
# The document-feature matrix is first standardized with "prop" weighting.
# nstart = 10 initializes the algorithm from 10 random starting points and
# keeps the solution with the lowest residual sum of squares.
kmeans.results.10 <- kmeans(
  x = dfm_weight(doc.features, "prop"),
  centers = 10,
  nstart = 10
)

# Cluster assignment of the first few posts.
head(x = kmeans.results.10$cluster)

# Sample messages from clusters 1 and 2.
head(x = obama$message[kmeans.results.10$cluster == 1])
head(x = obama$message[kmeans.results.10$cluster == 2])

# Within-cluster sum of squares, one value per cluster.
kmeans.results.10$withinss

# We can use the cluster assignments to look at the words that characterize
# a cluster: sum the feature counts over the posts assigned to it and draw
# the most frequent ones as a word cloud.
library(wordcloud)

# Cluster to inspect. NOTE(review): this used to be stored in a variable
# named `cluster2` even though cluster 5 is selected; the id now lives in one
# place and the names describe what they hold.
cluster.id <- 5
cluster.word.counts <- colSums(
  doc.features[kmeans.results.10$cluster == cluster.id, ]
)
wordcloud(names(cluster.word.counts), cluster.word.counts, max.words = 10)

# Structural Topic Models
# See structuraltopicmodel.com
library(devtools)
# install_github("bstewart/stm", dependencies = TRUE)

library(stm)
library(lda)

# Pre-process the post text, carrying the full data frame along as metadata,
# then prepare the documents, vocabulary and metadata for stm().
processed <- textProcessor(documents = obama$message, metadata = obama)
out <- prepDocuments(
  documents = processed$documents,
  vocab = processed$vocab,
  meta = processed$meta
)

# Indicator covariate: 1 when the post type is "photo", 0 otherwise.
out$meta$photo <- as.numeric(out$meta$type == "photo")

# 20-topic model with topic prevalence depending on the photo indicator.
# max.em.its = 5 keeps this demo fast; increase it for a real analysis.
stm.out <- stm(
  documents = out$documents,
  vocab = out$vocab,
  K = 20,
  prevalence = ~photo,
  data = out$meta,
  max.em.its = 5
)
labelTopics(stm.out)

# Let's look at how the text of posts that include photos is different:
# regress the prevalence of each of the 20 topics on the photo indicator and
# plot the difference between photo = 1 and photo = 0.
prep <- estimateEffect(
  formula = 1:20 ~ photo,
  stmobj = stm.out,
  metadata = out$meta
)
plot.estimateEffect(
  x = prep,
  covariate = "photo",
  model = stm.out,
  method = "difference",
  cov.value1 = 1,
  cov.value2 = 0
)

# Look up example messages for topic 16.
findThoughts(model = stm.out, texts = out$meta$message, topics = 16)
# How about the way in which Paul and Obama speak differently?

# Obama Facebook data (read again from the CSV file)
obama <- read.csv(file = "BarackObamaFacebook.csv")
dim(obama)
names(obama)
head(x = obama)
head(x = obama$message)

obama$message <- as.character(x = obama$message)

# Rand Paul Facebook data
paul <- read.csv(file = "RandPaulFacebook.csv")
dim(paul)
names(paul)
head(x = paul)
head(x = paul$message)

paul$message <- as.character(x = paul$message)

# Flag the author of each post (1 = Obama, 0 = Paul) and stack both tables.
obama$obama <- 1
paul$obama <- 0
merged <- rbind(obama, paul)

# Pre-process the combined posts and prepare them for stm().
processed <- textProcessor(documents = merged$message, metadata = merged)
out <- prepDocuments(
  documents = processed$documents,
  vocab = processed$vocab,
  meta = processed$meta
)

# 20-topic model with the author flag as content covariate.
# max.em.its = 5 keeps this demo fast; increase it for a real analysis.
stm.out <- stm(
  documents = out$documents,
  vocab = out$vocab,
  K = 20,
  content = ~obama,
  data = out$meta,
  max.em.its = 5
)
labelTopics(stm.out)
sageLabels(stm.out)

# "perspectives" plot for topic 18.
plot.STM(x = stm.out, type = "perspectives", topics = 18)

##################
# Poliblogs data
##################
data1 <- read.csv(file = "data1.csv")

# Pre-process the text in the `documents` column and prepare it for stm().
processed <- textProcessor(documents = data1$documents, metadata = data1)
out <- prepDocuments(
  documents = processed$documents,
  vocab = processed$vocab,
  meta = processed$meta
)

# 50-topic model; prevalence depends on rating plus a smooth term in day.
# You would want to change max.em.its = 5 to something bigger!
stm.out <- stm(
  documents = out$documents,
  vocab = out$vocab,
  K = 50,
  prevalence = ~rating + s(day),
  data = out$meta,
  max.em.its = 5
)

labelTopics(stm.out)

# Model summary plot, then a word cloud for topic 43.
plot.STM(x = stm.out, text.cex = .5, n = 7)
cloud(stm.out, topic = 43)

# Estimate covariate effects for all 50 topics and plot topic 12 against
# day as a continuous covariate.
prep <- estimateEffect(
  formula = 1:50 ~ rating + s(day),
  stmobj = stm.out,
  metadata = out$meta
)
plot.estimateEffect(
  x = prep,
  covariate = "day",
  model = stm.out,
  method = "continuous",
  topics = 12
)

library(devtools)
# Install stmBrowser from GitHub only when it is not installed yet, rather
# than downloading and rebuilding it on every run of the script.
if (!requireNamespace("stmBrowser", quietly = TRUE)) {
  install_github("mroberts/stmBrowser", dependencies = TRUE)
}
library(stmBrowser)

# Turn the numeric `day` covariate into calendar dates. Day 0 is 2007-12-31,
# so day 1 maps to 2008-01-01. Adding a number to a Date shifts it by that
# many days, so no round-trip through as.numeric()/origin is needed.
day.zero <- as.Date("2007-12-31")
out$meta$date <- day.zero + out$meta$day

# Open the interactive browser with rating and date as covariates; the
# document text is taken from the `documents` column of the metadata.
stmBrowser(stm.out, data = out$meta, covariates = c("rating", "date"),
           text = "documents")
