Task: Store all geo-coded tweets from Sweden, in real time.
apt-get install libproj-dev libgdal1-dev
install.packages("rgdal")
## The -dev headers are only needed while compiling rgdal; keep the runtime library and clean up the rest
apt-get install libgdal1
apt-get --purge remove libproj-dev libgdal1-dev
apt-get --purge autoremove
library(rgdal)
sweden <- readOGR("sweden-gis-data", layer = "Sweden_municipality07")
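A quick sanity check that the layer loaded with a sensible projection and the expected number of municipality polygons does not hurt (just an inspection sketch, nothing is stored):

## Inspect what readOGR returned
proj4string(sweden)   # projection stored in the shapefile
length(sweden)        # number of municipality polygons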
apt-get install libgeos-dev
install.packages("rgeos") library("rgeos")
apt-get --purge remove libgeos-dev
install.packages("maptools") library(maptools)
sweden.united <- unionSpatialPolygons(polygons(sweden), rep(1, length(sweden)))
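A quick plot is an easy way to confirm that the municipalities were dissolved into a single national polygon (just a visual check):

## Check the dissolve: one polygon covering all of Sweden
plot(sweden.united)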
No problem: just give a locations parameter that covers the whole of Sweden, and filter out the tweets that fall outside of Sweden.
Our object sweden already has an appropriate bounding box that we can use; just transform to longitude and latitude first.
bbox(spTransform(sweden, CRS("+proj=longlat")))
       min      max
x 11.13373 24.15473
y 55.34008 69.04941
tweets.df <- parseTweets(filterStream(file.name = "",
                                      locations = c(11.13373, 55.34008, 24.15473, 69.04941),
                                      tweets = 1000, oauth = twitCred),
                         verbose = FALSE)
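The same coordinates can also be pulled straight from the bounding box instead of being typed in by hand. This is just a convenience sketch; as.vector() on the bbox matrix happens to return the values in the order filterStream expects (south-west longitude, south-west latitude, north-east longitude, north-east latitude):

## Equivalent call, taking the box from the sweden object itself
sweden.box <- as.vector(bbox(spTransform(sweden, CRS("+proj=longlat"))))
tweets.df <- parseTweets(filterStream(file.name = "", locations = sweden.box,
                                      tweets = 1000, oauth = twitCred),
                         verbose = FALSE)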
As it happens, Twitter provides a variable "country" that we can filter on, so we do not have to filter by longitude and latitude if we don't want to; we can filter on a country string instead.
Does Twitter classify country well enough for us?
## Plot the Swedish border and overlay the tweets:
## black = country == "Sverige", red = everything else
sweden.united.longlat <- spTransform(sweden.united, CRS("+proj=longlat"))
plot(sweden.united.longlat)
these <- which(tweets.df$country == "Sverige")
swedes <- SpatialPointsDataFrame(coords = data.frame(tweets.df$place_lon[these],
                                                     tweets.df$place_lat[these]),
                                 tweets.df[these, ],
                                 proj4string = CRS("+proj=longlat"))
points(swedes@coords, pch = 20)
those <- which(tweets.df$country != "Sverige")
non.swedes <- SpatialPointsDataFrame(coords = data.frame(tweets.df$place_lon[those],
                                                          tweets.df$place_lat[those]),
                                     tweets.df[those, ],
                                     proj4string = CRS("+proj=longlat"))
points(non.swedes@coords, pch = 20, col = "red")
legend(x = "right", legend = c("Swedish", "not Swedish"),
       col = c("black", "red"), pch = 20)
Seems good enough.
swedes.temp.df <- tweets.df[which(tweets.df$country == "Sverige"), ]
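Should the country field ever turn out to be too coarse, a point-in-polygon test against the dissolved polygon is a drop-in alternative (a sketch, assuming the tweets carry valid place_lon/place_lat values):

## Keep only tweets whose coordinates fall inside the Swedish border polygon
pts <- SpatialPoints(data.frame(tweets.df$place_lon, tweets.df$place_lat),
                     proj4string = CRS("+proj=longlat"))
inside <- !is.na(over(pts, sweden.united.longlat))
swedes.temp.df <- tweets.df[inside, ]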
library(RMySQL)
con <- dbConnect("MySQL", "twitter", username = "foo", password = "bar")
## Escape chars that have special meaning in SQL with the function dbEscapeStrings()
swedes.df <- data.frame(text = dbEscapeStrings(con, swedes.temp.df$text),
                        id = swedes.temp.df$id_str,
                        in_reply_to = swedes.temp.df$in_reply_to_status_id_str,
                        source = dbEscapeStrings(con, swedes.temp.df$source),
                        timestamp = swedes.temp.df$created_at,
                        user = swedes.temp.df$name,
                        latitude = swedes.temp.df$place_lat,
                        longitude = swedes.temp.df$place_lon)
## This assumes there exists a table tweets in the MySQL database "twitter"
dbWriteTable(con, "tweets", swedes.df, row.names = FALSE, append = TRUE)
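A quick count confirms that the rows actually landed in the table (assuming the table name used above):

## How many tweets have been stored so far?
dbGetQuery(con, "SELECT COUNT(*) FROM tweets")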
To continuously capture data, try something like this:
my.twitter.func <- function() {
    tweets.df <- parseTweets(filterStream(file.name = "",
                                          locations = c(11.13373, 55.34008, 24.15473, 69.04941),
                                          tweets = 1000, oauth = twitCred),
                             verbose = FALSE)
    swedes.temp.df <- tweets.df[which(tweets.df$country == "Sverige"), ]
    swedes.df <- data.frame(text = dbEscapeStrings(con, swedes.temp.df$text),
                            id = swedes.temp.df$id_str,
                            in_reply_to = swedes.temp.df$in_reply_to_status_id_str,
                            source = dbEscapeStrings(con, swedes.temp.df$source),
                            timestamp = swedes.temp.df$created_at,
                            user = swedes.temp.df$name,
                            latitude = swedes.temp.df$place_lat,
                            longitude = swedes.temp.df$place_lon)
    if (length(grep("tweets", dbListTables(con))) == 0) {
        ## Create the table; this branch only runs the first time
        dbWriteTable(con, "tweets", swedes.df, row.names = FALSE)
    } else {
        dbWriteTable(con, "tweets", swedes.df, row.names = FALSE, append = TRUE)
    }
}
## Loop indefinitely
while (TRUE) {
    my.twitter.func()
}
## This might lose a few tweets during the saving-to-disk procedure, but hey,
## we want the big picture, right?
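If you plan to leave this running for days, it may be worth guarding the loop so that a dropped stream connection or a failed database write does not kill the collector. This is only a sketch, not part of the setup above:

## More defensive variant of the loop: log errors and keep going
while (TRUE) {
    tryCatch(my.twitter.func(),
             error = function(e) message("capture failed: ", conditionMessage(e)))
    Sys.sleep(1)  # short pause before reopening the stream
}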