"-"
-- Build a small debug subset of `tweets`: clone the schema (LIKE also copies
-- indexes), then insert only the tweets whose id falls in the first ~10k ids.
CREATE TABLE `debug_tweets` LIKE `tweets`;
INSERT INTO `debug_tweets`
SELECT *
FROM `tweets`
WHERE id IN (
    SELECT twitterid
    FROM ids
    WHERE id > 0
      AND id < 10000
);
-- Alternative one-step variant: create debug_tweets directly from the SELECT.
-- NOTE(review): run either this or the LIKE-based variant, not both — the
-- table name clashes. The id range here is < 1000 vs < 10000 in the other
-- variant; confirm which subset size is intended. Also note that
-- CREATE TABLE ... SELECT does not copy indexes from `tweets`.
CREATE TABLE `debug_tweets`
SELECT *
FROM `tweets`
WHERE id IN (
    SELECT twitterid
    FROM ids
    WHERE id > 0
      AND id < 1000
);
-- Prepend an integer column to hold the gap-less row id; it is populated
-- later from the `temp` mapping table.
ALTER TABLE debug_tweets
    ADD COLUMN random_id INT(11) FIRST;
-- Dump (random_id, id) pairs as a quoted CSV for processing in R.
SELECT random_id, id
FROM debug_tweets
INTO OUTFILE '/tmp/test3.csv'
    FIELDS TERMINATED BY ','
    OPTIONALLY ENCLOSED BY '"'
    LINES TERMINATED BY '\n';
-- Export every tweet id (default tab/newline format, one id per line) so R
-- can read the full id vector slice by slice.
SELECT id
FROM tweets
INTO OUTFILE '/tmp/ids.csv';
# Load the exported CSV, attach the pre-computed random-id vector, and push
# the (random_id, id) mapping back to MySQL as table `temp`.
# Fix: the original had all four statements mashed onto a single line, which
# is a syntax error in R — they must be separate statements.
# NOTE(review): the export above wrote /tmp/test3.csv but this reads
# /tmp/test5.csv — confirm the intended file name.
# assumes `rid` (the random id vector) is already in the workspace — TODO confirm
bar <- read.table("/tmp/test5.csv",
                  sep = ",",
                  na.strings = "\\N",
                  colClasses = c("numeric", "character"))
# Overwrite column 1 with the random ids; 1:9999 matches the
# `id > 0 and id < 10000` subset used when building debug_tweets.
bar$V1 <- rid[1:9999]
colnames(bar) <- c("random_id", "id")
dbWriteTable(con2, "temp", bar, row.names = FALSE,
             field.types = c(random_id = "int(11)", id = "varchar(20)"))
-- Copy the random_id mapping from temp into debug_tweets, matched on id.
UPDATE debug_tweets
INNER JOIN temp
    ON debug_tweets.id = temp.id
SET debug_tweets.random_id = temp.random_id;
Add a mapping from twitterid to a gap-less integer vector. This makes it possible to address every row in tweets, which is needed when the full twitterid vector is too large to fit in RAM. Using the mapping, one can manipulate one slice at a time.
-- Re-apply the mapping after each slice of `temp` has been loaded
-- (this is the statement the slice loop below must run per slice).
UPDATE debug_tweets
    INNER JOIN temp ON debug_tweets.id = temp.id
SET debug_tweets.random_id = temp.random_id;
Script this, using slices of 1E6 elements.
# Process the id vector in slices of 1e6 so the id -> random_id mapping never
# has to fit in RAM at once: for each slice, read the matching ids from
# /tmp/ids.csv, attach the corresponding chunk of `rid`, stage it in MySQL as
# `temp`, and apply it before the next slice overwrites `temp`.
# assumes `rid` holds one random id per row of /tmp/ids.csv — TODO confirm
slice.size <- 1E6
slice.endpoints <- seq(from = 0, to = length(rid), by = slice.size)
# Fix: seq() silently drops a trailing partial slice; append the true end.
if (tail(slice.endpoints, 1) < length(rid)) {
  slice.endpoints <- c(slice.endpoints, length(rid))
}
for (i in 1:(length(slice.endpoints) - 1)) {
  # Fix: nrows must be the slice SIZE, not the cumulative endpoint — the
  # original read slice.endpoints[i+1] rows and over-read from iteration 2 on.
  n.rows <- slice.endpoints[i + 1] - slice.endpoints[i]
  bar <- read.table("/tmp/ids.csv",
                    colClasses = "character",
                    skip = slice.endpoints[i],
                    nrows = n.rows)
  bar$random_id <- rid[(slice.endpoints[i] + 1):slice.endpoints[i + 1]]
  # Fix: ids.csv has one column, so column 1 (V1) is the id and the appended
  # random_id is column 2 — the original c("random_id", "id") swapped labels.
  colnames(bar) <- c("id", "random_id")
  # Fix: IF EXISTS, so the first iteration works when `temp` was never created.
  dbExecute(con2, "drop table if exists temp")
  dbWriteTable(con2, "temp", bar, row.names = FALSE,
               field.types = c(random_id = "int(11)", id = "varchar(20)"))
  # Fix: apply this slice's mapping now — the original loop never ran the
  # update, so every slice but the last was overwritten unused.
  dbExecute(con2, "update debug_tweets inner join temp on debug_tweets.id = temp.id set debug_tweets.random_id = temp.random_id")
}