do.call(rbind, list) for uneven number of column
I have a list, with each element being a character vector, of differing lengths I would like to bind the data as rows, so that the column names 'line up' and if there is extra data then create column and if there is missing data then create NAs
Below is a mock example of the data I am working with
x <- list()
x[[1]] <- letters[seq(2,20,by=2)]
names(x[[1]]) <- LETTERS[c(1:length(x[[1]]))]
x[[2]] <- letters[seq(3,20, by=3)]
names(x[[2]]) <- LETTERS[seq(3,20, by=3)]
x[[3]] <- letters[seq(4,20, by=4)]
names(x[[3]]) <- LETTERS[seq(4,20, by=4)]
The below line would normally be what I would do if I was sure that the format for each element was the same...
do.call(rbind,x)
I was hoping that someone had come up with a nice little solution that matches up the column names and fills in blanks with NA
s whilst adding new columns if in the binding process new columns are found...
rbind.fill
is an awesome function that does really well on list of data.frames. But IMHO, for this case, it could be done much faster when the list contains only (named) vectors.
The rbind.fill
way
require(plyr)
rbind.fill(lapply(x,function(y){as.data.frame(t(y),stringsAsFactors=FALSE)}))
A more straightforward way (and efficient for this scenario at least):
rbind.named.fill <- function(x) {
nam <- sapply(x, names)
unam <- unique(unlist(nam))
len <- sapply(x, length)
out <- vector("list", length(len))
for (i in seq_along(len)) {
out[[i]] <- unname(x[[i]])[match(unam, nam[[i]])]
}
setNames(as.data.frame(do.call(rbind, out), stringsAsFactors=FALSE), unam)
}
Basically, we get total unique names to form the columns of the final data.frame. Then, we create a list with length = input and just fill the rest of the values with NA
. This is probably the "trickiest" part as we've to match the names while filling NA. And then, we set names once finally to the columns (which can be set by reference using setnames
from data.table
package as well if need be).
Now to some benchmarking:
Data:
# generate some huge random data:
set.seed(45)
sample.fun <- function() {
nam <- sample(LETTERS, sample(5:15))
val <- sample(letters, length(nam))
setNames(val, nam)
}
ll <- replicate(1e4, sample.fun())
Functions:
# plyr's rbind.fill version:
rbind.fill.plyr <- function(x) {
rbind.fill(lapply(x,function(y){as.data.frame(t(y),stringsAsFactors=FALSE)}))
}
rbind.named.fill <- function(x) {
nam <- sapply(x, names)
unam <- unique(unlist(nam))
len <- sapply(x, length)
out <- vector("list", length(len))
for (i in seq_along(len)) {
out[[i]] <- unname(x[[i]])[match(unam, nam[[i]])]
}
setNames(as.data.frame(do.call(rbind, out), stringsAsFactors=FALSE), unam)
}
Update (added GSee's function as well):
foo <- function (...)
{
dargs <- list(...)
all.names <- unique(names(unlist(dargs)))
out <- do.call(rbind, lapply(dargs, `[`, all.names))
colnames(out) <- all.names
as.data.frame(out, stringsAsFactors=FALSE)
}
Benchmarking:
require(microbenchmark)
microbenchmark(t1 <- rbind.named.fill(ll),
t2 <- rbind.fill.plyr(ll),
t3 <- do.call(foo, ll), times=10)
identical(t1, t2) # TRUE
identical(t1, t3) # TRUE
Unit: milliseconds
expr min lq median uq max neval
t1 <- rbind.named.fill(ll) 243.0754 258.4653 307.2575 359.4332 385.6287 10
t2 <- rbind.fill.plyr(ll) 16808.3334 17139.3068 17648.1882 17890.9384 18220.2534 10
t3 <- do.call(foo, ll) 188.5139 204.2514 229.0074 339.6309 359.4995 10
If you want the result to be a matrix...
I recently wrote this function for a co-worker that wanted to rbind vectors into a matrix.
foo <- function (...)
{
dargs <- list(...)
if (!all(vapply(dargs, is.vector, TRUE)))
stop("all inputs must be vectors")
if (!all(vapply(dargs, function(x) !is.null(names(x)), TRUE)))
stop("all input vectors must be named.")
all.names <- unique(names(unlist(dargs)))
out <- do.call(rbind, lapply(dargs, `[`, all.names))
colnames(out) <- all.names
out
}
R > do.call(foo, x)
A B C D E F G H I J L O R P T
[1,] "b" "d" "f" "h" "j" "l" "n" "p" "r" "t" NA NA NA NA NA
[2,] NA NA "c" NA NA "f" NA NA "i" NA "l" "o" "r" NA NA
[3,] NA NA NA "d" NA NA NA "h" NA NA "l" NA NA "p" "t"
Here's a version using the package data.table
, a bit faster for very big data.
It uses the function rbindlist
and its argument fill=TRUE
passed to the function do.call
.
library(data.table)
x <- list()
x[[1]] <- letters[seq(2,20,by=2)]
names(x[[1]]) <- LETTERS[c(1:length(x[[1]]))]
x[[2]] <- letters[seq(3,20, by=3)]
names(x[[2]]) <- LETTERS[seq(3,20, by=3)]
x[[3]] <- letters[seq(4,20, by=4)]
names(x[[3]]) <- LETTERS[seq(4,20, by=4)]
x2 <- lapply(x, as.list)
rbindlist(x2, fill=TRUE)
#> A B C D E F G H I J L O R P T
#> 1: b d f h j l n p r t <NA> <NA> <NA> <NA> <NA>
#> 2: <NA> <NA> c <NA> <NA> f <NA> <NA> i <NA> l o r <NA> <NA>
#> 3: <NA> <NA> <NA> d <NA> <NA> <NA> h <NA> <NA> l <NA> <NA> p t
It adds a small overhead because it needs the character vectors to be converted with as.list
. This passage can also add time to the process, depending on how the data is generated.
On the other hand, it seems to perform faster on large datasets.
It returns a data.table
.
I rewrote @Arun and @GSee's examples to generate a bigger sample.
Data
# generate some huge random data:
set.seed(45)
sample.fun <- function() {
nam <- sample(LETTERS, sample(5:15))
val <- sample(letters, length(nam))
setNames(val, nam)
}
l1 <- replicate(1e6, sample.fun()) # Arun's data, just bigger
l2 <- lapply(l1, as.list) # same data converted with as.list
Functions
library(microbenchmark)
library(data.table)
# Arun's function
rbind.named.fill <- function(x) {
nam <- sapply(x, names)
unam <- unique(unlist(nam))
len <- sapply(x, length)
out <- vector("list", length(len))
for (i in seq_along(len)) {
out[[i]] <- unname(x[[i]])[match(unam, nam[[i]])]
}
setNames(as.data.frame(do.call(rbind, out), stringsAsFactors=FALSE), unam)
}
# GSee's function
foo <- function (...)
{
dargs <- list(...)
all.names <- unique(names(unlist(dargs)))
out <- do.call(rbind, lapply(dargs, `[`, all.names))
colnames(out) <- all.names
as.data.frame(out, stringsAsFactors=FALSE)
}
Benchmarking
microbenchmark(t1 <- rbind.named.fill(l1),
t2 <- rbindlist(l2, fill=TRUE),
t3 <- do.call(foo, l1),
times=10)
#> Unit: seconds
#> expr min lq mean median uq max neval
#> t1 <- rbind.named.fill(l1) 6.536782 7.545538 9.118771 9.304844 10.505814 11.28260 10
#> t2 <- rbindlist(l2, fill=TRUE) 5.250387 5.787712 6.910340 6.226065 7.579503 10.40524 10
#> t3 <- do.call(foo, l1) 9.590615 11.043557 13.504694 12.550535 15.364464 19.95877 10
identical(t1, data.frame(t2))
#> [1] TRUE
identical(t3, data.frame(t2))
#> [1] TRUE
Created on 2019-08-01 by the reprex package (v0.3.0)
After you convert your names vector to individual dataframes, you can use dplyr::bind_rows
dplyr::bind_rows(lapply(x,function(y) as.data.frame(t(y),stringsAsFactors=FALSE)))
# A B C D E F G H I J L O R P T
#1 b d f h j l n p r t <NA> <NA> <NA> <NA> <NA>
#2 <NA> <NA> c <NA> <NA> f <NA> <NA> i <NA> l o r <NA> <NA>
#3 <NA> <NA> <NA> d <NA> <NA> <NA> h <NA> <NA> l <NA> <NA> p t
We can also use purrr::map_df
/purrr::map_dfr
in this case
purrr::map_df(x, ~as.data.frame(t(.x),stringsAsFactors = FALSE))
This will give the same output as above.