MariaDB Cheat Sheet

character to avoid in table names

"-"

make a small copy of a table to debug on

With indices

CREATE TABLE `debug_tweets` LIKE `tweets`;
INSERT INTO `debug_tweets` SELECT * FROM `tweets` where id in (select twitterid from ids where id > 0 and id < 10000);

without indices

CREATE TABLE `debug_tweets` SELECT * FROM `tweets` where id in (select twitterid from ids where id > 0 and id < 1000);

add a column, data in an external text file

Change the structure, add an empty column

alter table debug_tweets ADD COLUMN random_id int(11) FIRST;

write out the table to a csv-file

select random_id, id from debug_tweets INTO OUTFILE '/tmp/test3.csv' FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"' LINES TERMINATED BY '\n';

select id from tweets INTO OUTFILE '/tmp/ids.csv';

import to R, assign random_id, export

bar <- read.table("/tmp/test5.csv", sep = ",", na.strings = "\\N", colClasses = c("numeric", "character")) bar$V1 <- rid[1:9999] colnames(bar) <- c("random_id", "id") dbWriteTable(con2, "temp", bar, row.names=FALSE, field.types=c(random_id="int(11)", id="varchar(20)"))

Fill the empty column with data

update debug_tweets inner join temp on debug_tweets.id = temp.id set debug_tweets.random_id = temp.random_id;

Add a mapping from twitterid to an gap-less integer vector. This makes it possible to adress all rows in tweets, which is needed when the full twitterid vector is too large to fit in RAM. Using the mapping, one can manipulate one slice at a time.

export the vector of twitterids: select id from tweets INTO OUTFILE '/tmp/ids.csv';
create a vector of random_id in R: rid <- sample.int(n=nrow) where nrow is the number of ids in tweets.
import a slice of that vector into R. bar <- read.table("/tmp/ids.csv", colClasses = "character", skip=0, nrows=10000000)
add the random_id from rid to that slice: bar$random_id <- rid[1:1000000]
colnames(bar) <- c("random_id", "id")
dbWriteTable(con2, "temp", bar, row.names=FALSE, field.types=c(random_id="int(11)", id="varchar(20)"))

update debug_tweets inner join temp on debug_tweets.id = temp.id set debug_tweets.random_id = temp.random_id;

Script this, use slices of 1E6 elements

slice.endpoints <- seq(from = 0, to = length(rid), by = 1E6) for(i in 1:(length(slice.endpoints)-1)){ bar <- read.table("/tmp/ids.csv", colClasses = "character", skip=slice.endpoints[i], nrows=slice.endpoints[i+1]) bar$random_id <- rid[(slice.endpoints[i]+1):slice.endpoints[i+1]] colnames(bar) <- c("random_id", "id") dbExecute(con2, "drop table temp") dbWriteTable(con2, "temp", bar, row.names=FALSE, field.types=c(random_id="int(11)", id="varchar(20)")) }

comments powered by Disqus

Back to the index

Blog roll

R-bloggers, Debian Weekly

Last modified: oktober 17, 2019