Twitter Data
This example scrapes Twitter data, visualizes it, and looks at some descriptive information:
-
First, we install the rtweet package and load the libraries used in this example:
# One-time setup: install rtweet (only needed if it is not installed yet).
install.packages("rtweet")

# Load the packages used throughout the example. Each call must be its own
# statement; R cannot parse several calls on one line without a separator.
library(ggplot2)
library(rtweet)
library(igraph)
library(tidyverse)
library(ggraph)
-
Second, we create the Twitter token:
# Create the Twitter API token. Replace the placeholder strings with the
# credentials of your own Twitter developer app.
# Arguments are passed by name with `=`. Using `<-` inside the call would
# pass the values positionally and also create global variables holding
# the credentials.
token <- rtweet::create_token(
  app = "APPNAME",
  consumer_key = "YOURKEY",
  consumer_secret = "YOURSECRETKEY",
  access_token = "...",
  access_secret = "..."
)
Note: You need a Twitter developer account for this.
-
We collect tweets for a specific subject or user; in this case, we collect tweets that mention the names Biden and Trump:
# Search for up to 5000 tweets per keyword, excluding retweets.
# Each assignment is its own statement so the snippet parses.
biden <- rtweet::search_tweets("Biden", n = 5000, include_rts = FALSE)
trump <- rtweet::search_tweets("Trump", n = 5000, include_rts = FALSE)
-
We then geocode them (that is, extract their latitude and longitude) and map them:
# Derive coordinates for each set of tweets; the plotting calls below read
# the `lng` and `lat` columns of the results.
coordB <- rtweet::lat_lng(biden)
coordT <- rtweet::lat_lng(trump)

# Remove the plot margins and draw thin US state outlines as the base map.
par(mar = c(0, 0, 0, 0))
maps::map("state", lwd = 0.25)

# Overlay tweet locations: Biden tweets in blue, Trump tweets in red.
points(coordB$lng, coordB$lat, pch = 20, cex = 0.75, col = "blue")
points(coordT$lng, coordT$lat, pch = 20, cex = 0.75, col = "red")
- Now we will collect tweets that mention the users realDonaldTrump and JoeBiden:
# English-language tweets matching each account name (up to 1000 per query).
rdt <- rtweet::search_tweets(
  q = "realDonaldTrump",
  n = 1000,
  lang = "en"
)
bid <- rtweet::search_tweets(
  q = "JoeBiden",
  n = 1000,
  lang = "en"
)
- And then we check the most popular hashtags used when tweeting about Trump and Biden:
library(stringr)

# Extract every hashtag from the Trump tweets, flatten the per-tweet lists
# into one character vector, and show the most frequent ones.
dt <- str_extract_all(rdt$text, "#(\\d|\\w)+")
dt <- unlist(dt)
# Tabulate `dt`, the vector built above (the original referenced an
# undefined object `ht`).
head(sort(table(dt), decreasing = TRUE))

# Same steps for the Biden tweets.
jb <- str_extract_all(bid$text, "#(\\d|\\w)+")
jb <- unlist(jb)
head(sort(table(jb), decreasing = TRUE))
And the results are:
Trump | Biden |
---|---|
#AIDS (24) | #Trump (124) |
#ExecutiveOrder (24) | #ExecutiveOrder (121) |
#HIV (24) | #ObamaBiden (120) |
#IdiotInChief (24) | #JoeBiden (22) |
#Trump (24) | #Trump2020 (20) |
- And now we can check how many times their names are mentioned when tweeting about the other. As a bonus, we are also going to check how many times Obama’s name is mentioned when tweeting about them:
# Count the tweets whose text contains a given name (case-insensitive).
# Console output from the original run is kept as `#>` comments so the
# snippet can be pasted and run as-is.

# "obama" in the tweets about Trump
length(grep("obama", rdt$text, ignore.case = TRUE))
#> [1] 78

# "obama" in the tweets about Biden
length(grep("obama", bid$text, ignore.case = TRUE))
#> [1] 302

# "trump" in the tweets about Biden
length(grep("trump", bid$text, ignore.case = TRUE))
#> [1] 491

# "biden" in the tweets about Trump
length(grep("biden", rdt$text, ignore.case = TRUE))
#> [1] 112
- We can then create a network plot (based, for instance, on retweeted tweets and the users they mention) to gauge users’ influence:
# Build a graph from tweets that were retweeted at least once: one edge per
# (author, mentioned screen name) pair, dropping tweets with no mentions.
rdt_g <- rdt %>%
  filter(retweet_count > 0) %>%
  select(screen_name, mentions_screen_name) %>%
  unnest(mentions_screen_name) %>%
  filter(!is.na(mentions_screen_name)) %>%
  graph_from_data_frame()

# Only nodes with degree above 20 get a label and a non-zero size, so the
# plot highlights the most connected accounts. The degree is computed once.
node_degree <- degree(rdt_g)
is_hub <- node_degree > 20
V(rdt_g)$node_label <- unname(ifelse(is_hub, names(V(rdt_g)), ""))
V(rdt_g)$node_size <- unname(ifelse(is_hub, node_degree, 0))

# Draw the network with the Kamada-Kawai ("kk") layout. `after_stat(index)`
# replaces the deprecated `..index..` notation for the edge alpha mapping.
ggraph(rdt_g, layout = "kk") +
  geom_edge_arc(edge_width = 0.1, aes(alpha = after_stat(index))) +
  geom_node_label(
    aes(label = node_label, size = node_size),
    label.size = 0, fill = "#ffffff66", segment.colour = "light blue",
    color = "red", repel = TRUE, family = "Apple Garamond"
  ) +
  coord_fixed() +
  scale_size_area(trans = "sqrt") +
  labs(
    title = "Title",
    subtitle = "Edges=volume of retweets. Screenname size=influence"
  ) +
  theme_graph(base_family = "Apple Garamond") +
  theme(legend.position = "none")
Written on June 17, 2020