Twitter Data

This example scrapes Twitter data, visualizes it, and looks at some descriptive information:

First, we install the rtweet package:

 install.packages("rtweet")
 library(ggplot2)
 library(rtweet)
 library(igraph)
 library(tidyverse)
 library(ggraph)

Second, we create the Twitter token:

 token <- rtweet::create_token(
 app = "APPNAME",
 consumer_key <- "YOURKEY",
 consumer_secret <- "YOURSECRETKEY",
 access_token <- "...",
 access_secret <- "...")

Obs: You need a Twitter developer account for this.

We collect tweets for a specific subject or user, in this case we will collect tweets that mention just the names Biden and Trump:

 biden <- rtweet::search_tweets("Biden", n = 5000, include_rts = FALSE)

 trump <- rtweet::search_tweets("Trump", n = 5000, include_rts = FALSE)

We then geocode them, or extract latitude and longitude, and map them:

coordB <- rtweet::lat_lng(biden)

coordT <- rtweet::lat_lng(trump)

par(mar = c(0, 0, 0, 0))
maps::map("state", lwd = .25)

with(coordB, points(lng, lat, pch = 20, cex = .75, col = "blue"))

with(coordT, points(lng, lat, pch = 20, cex = .75, col = "red"))

Resulting Map

Now we will collect tweets that mentions the users realDonaldTrump and JoeBIden:

rdt <- rtweet::search_tweets(q = "realDonaldTrump", n = 1000, lang = "en")

bid <- rtweet::search_tweets(q = "JoeBiden", n = 1000, lang = "en")

And then we check the most popular hashtags used when tweeting about Trump and Biden:

library(stringr)
dt <- str_extract_all(rdt$text, "#(\\d|\\w)+")
dt <- unlist(dt)
head(sort(table(ht), decreasing = TRUE))

jb <- str_extract_all(bid$text, "#(\\d|\\w)+")
jb <- unlist(jb)
head(sort(table(jb), decreasing = TRUE))

And the results are:

Trump	Biden
#AIDS (24)	#Trump (124)
#ExecutiveOrder (24)	#ExecutiveOrder (121)
#HIV (24)	#ObamaBiden (120)
#IdiotInChief (24)	#JoeBiden (22)
#Trump (24)	#Trump2020 (20)

And now we can check how many times their names are mentioned when tweeting about the other. As a bonus, we are also going to check how many times Obama’s name is mentioned when tweeting about them:

length(grep("obama", rdt$text, ignore.case=TRUE))  
[1] 78

length(grep("obama", bid$text, ignore.case=TRUE))  
[1] 302

length(grep("trump", bid$text, ignore.case=TRUE))  
[1] 491

length(grep("biden", rdt$text, ignore.case=TRUE))
[1] 112

We can then create a network plot to see based, for instance, on retweets to check user’s influence:

filter(rdt, retweet_count > 0 ) %>% 
  select(screen_name, mentions_screen_name) %>%
  unnest(mentions_screen_name) %>% 
  filter(!is.na(mentions_screen_name)) %>% 
  graph_from_data_frame() -> rdt_g
V(rdt_g)$node_label <- unname(ifelse(degree(rdt_g)[V(rdt_g)] > 20, names(V(rdt_g)), "")) 
V(rdt_g)$node_size <- unname(ifelse(degree(rdt_g)[V(rdt_g)] > 20, degree(rdt_g), 0)) 
ggraph(rdt_g, layout = 'kk') + 
  geom_edge_arc(edge_width=0.1, aes(alpha=..index..)) +
  geom_node_label(aes(label=node_label, size=node_size),
                  label.size=0, fill="#ffffff66", segment.colour="light blue",
                  color="red", repel=TRUE, family="Apple Garamond") +
  coord_fixed() +
  scale_size_area(trans="sqrt") +
  labs(title="Title", subtitle="Edges=volume of retweets. Screenname size=influence") +
  theme_graph(base_family="Apple Garamond") +
  theme(legend.position="none") 

Network Plot

Written on June 17, 2020