Text As Data.
Trying out topic modeling. I had to insert the outputs as images here because the data file goes over the repo's 100 MB file limit.
library(tm)                   # Corpus(), tm_map(), stopwords(), stemDocument()
library(quanteda)             # tokens(), dfm(), fcm(), dfm_trim(), topfeatures()
library(quanteda.textplots)   # textplot_network()
library(topicmodels)          # LDA(), terms()
library(magrittr)             # %>%

tweets <- readRDS("~/networks-blog/boliviatweetsfile")
tweets <- iconv(tweets$text, to = "ASCII", sub = " ")        # Convert to ASCII, replacing non-ASCII characters with spaces
tweets <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", tweets)    # Remove "RT" (retweet) markers and usernames
tweets <- gsub("http.+ |http.+$", " ", tweets)               # Remove links
tweets <- gsub("http[[:alnum:]]*", "", tweets)               # Remove remaining link fragments
tweets <- gsub("[[:punct:]]", " ", tweets)                   # Remove punctuation
tweets <- gsub("[ |\t]{2,}", " ", tweets)                    # Collapse runs of spaces and tabs
tweets <- gsub("^ ", "", tweets)                             # Leading blanks
tweets <- gsub(" $", "", tweets)                             # Trailing blanks
tweets <- gsub(" +", " ", tweets)                            # Collapse multiple spaces
tweets <- tolower(tweets)                                    # Lowercase everything
tweets <- unique(tweets)                                     # Drop duplicate tweets
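Before building the corpus, it is worth a quick sanity check that the regex cleaning behaved as expected; a minimal sketch:

# Peek at a few cleaned tweets and see how many unique ones remain
head(tweets, 3)
length(tweets)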
corpus <- Corpus(VectorSource(tweets))                       # Build a tm corpus from the cleaned tweets
corpus <- tm_map(corpus, removeWords, stopwords("english"))  # Remove English stopwords
corpus <- tm_map(corpus, removeNumbers)                      # Remove numbers
corpus <- tm_map(corpus, stemDocument)                       # Stem words
corpus <- tm_map(corpus, removeWords, c("amp", "will", "get", "can", "like", "say", "know")) # Remove frequent filler words
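A quick look at what the tm side produced before handing off to quanteda; a minimal check using tm's inspect():

# Peek at a couple of documents after stopword removal, number removal, and stemming
inspect(corpus[1:2])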
twtdfmstop <- tokens(as.character(corpus),                   # Tokenize the processed corpus with quanteda
                     remove_punct = TRUE,
                     remove_url = TRUE,
                     remove_numbers = TRUE,
                     remove_separators = TRUE,
                     verbose = TRUE)
tweetdfm <- dfm(twtdfmstop)                                  # Document-feature matrix from the tokens
tweetdfm <- tweetdfm %>%
  dfm_remove(stopwords("english")) %>%                       # Drop any remaining English stopwords
  dfm_remove(c("bolivia", "coup"))                           # Drop overly dominant terms
twtdfmstop2 <- dfm_remove(tweetdfm, c("can", "get", "know", "#", "@", "now", "like"))
smaller_dfm <- dfm_trim(twtdfmstop2, min_termfreq = 100)     # Keep features appearing at least 100 times
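Before turning the trimmed dfm into a co-occurrence network, it helps to glance at which features survived the trimming; a quick check (the cutoff of 20 terms is arbitrary):

# Most frequent features left after the extra stopword removal and trimming
topfeatures(smaller_dfm, 20)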
# create fcm from dfm
smaller_fcm <- fcm(smaller_dfm)
myFeatures <- names(topfeatures(smaller_fcm, 30))
# retain only those top features as part of our matrix
even_smaller_fcm <- fcm_select(smaller_fcm, pattern = myFeatures, selection = "keep")
# check dimensions
dim(even_smaller_fcm)
# compute size weight for vertices in network
size <- log(colSums(even_smaller_fcm))
# create plot
set.seed(100)
textplot_network(even_smaller_fcm, vertex_size = size / max(size) * 3)
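Since the rendered plots had to be inserted as images (see the note at the top), one option is to save the network plot to a file. This is a sketch that assumes textplot_network() returns a ggplot object; the filename is just an example:

library(ggplot2)
net <- textplot_network(even_smaller_fcm, vertex_size = size / max(size) * 3)
ggsave("bolivia_fcm_network.png", net, width = 8, height = 6, dpi = 300)   # example filename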
tweetdfm <- dfm_subset(tweetdfm, ntoken(tweetdfm) > 0)       # LDA needs every document to have at least one term
lda <- LDA(convert(tweetdfm, to = "topicmodels"), k = 10)    # Convert the quanteda dfm for topicmodels::LDA()
term <- terms(lda, 10)                                       # Top 10 terms per topic
term
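Beyond the top terms per topic, topicmodels can also report the most likely topic for each tweet and the full posterior distributions; a short sketch:

topic_per_tweet <- topics(lda, 1)   # most likely topic for each document
head(topic_per_tweet)
post <- posterior(lda)              # per-topic word probabilities and per-document topic probabilities
str(post$topics)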