Reshape a dataframe to long format with multiple sets of measure columns [duplicate]
Using the new feature in melt
from data.table v1.9.5+
:
require(data.table) # v1.9.5+
melt(setDT(df), measure = patterns("^year", "^pop"), value.name = c("year", "pop"))
You can find the rest of the vignettes here.
If the 'year', 'pop', columns are alternating, we can subset with c(TRUE, FALSE)
to get the columns 1, 3, 5,..etc. and c(FALSE, TRUE)
to get 2, 4, 6,.. due to the recycling. Then, we unlist
the columns and create a new 'data.frame.
df2 <- data.frame(year=unlist(df1[c(TRUE, FALSE)]),
pop=unlist(df1[c(FALSE, TRUE)]))
row.names(df2) <- NULL
head(df2)
# year pop
#1
#2 16XX 4675,0
#3 17XX 4739,3
#4 17XX 4834,0
#5 180X 4930,0
#6 180X 5029,0
Or another option is
library(splitstackshape)
merged.stack(transform(df1, id=1:nrow(df1)), var.stubs=c('year', 'pop'),
sep='var.stubs')[order(.time_1), 3:4, with=FALSE]
data
df1 <- structure(list(year1 = c("", "16XX", "17XX", "17XX", "180X",
"180X", "181X", "181X", "182X", "182X"), pop1 = c("", "4675,0",
"4739,3", "4834,0", "4930,0", "5029,0", "5129,0", "5231,9", "5297,0",
"5362,0"), year2 = c(NA, 1900L, 1901L, 1902L, 1903L, 1904L, 1905L,
1906L, 1907L, 1908L), pop2 = c("", "6453,0", "6553,5", "6684,0",
"6818,0", "6955,0", "7094,0", "7234,7", "7329,0", "7422,0"),
year3 = c(NA, 1930L, 1931L, 1932L, 1933L, 1934L, 1935L, 1936L,
1937L, 1938L), pop3 = c("", "9981,2", "", "", "", "", "",
"", "", "")), .Names = c("year1", "pop1", "year2", "pop2",
"year3", "pop3"), class = "data.frame", row.names = c(NA, -10L))
Another option is to use split.default
to split the dataframe in a list of dataframes and then bind them together:
lst <- lapply(split.default(df1, sub('.*(\\d)', '\\1', names(df1))),
setNames, c('year','pop'))
do.call(rbind, lst)
which gives the desired result:
year pop 1.1 16XX 4675,0 1.2 17XX 4739,3 1.3 17XX 4834,0 1.4 180X 4930,0 1.5 180X 5029,0 1.6 181X 5129,0 1.7 181X 5231,9 1.8 182X 5297,0 1.9 182X 5362,0 2.1 1900 6453,0 2.2 1901 6553,5 2.3 1902 6684,0 2.4 1903 6818,0 2.5 1904 6955,0 2.6 1905 7094,0 2.7 1906 7234,7 2.8 1907 7329,0 2.9 1908 7422,0 3.1 1930 9981,2 3.2 1931 10583,5 3.3 1932 8671,0 3.4 1933 9118,0 3.5 1934 9625,0 3.6 1935 8097,0 3.7 1936 7984,7 3.8 1937 8729,0 3.9 1938 10462,0
You could also use rbindlist
from the data.table
package for the last step:
library(data.table)
rbindlist(lst)
Used data:
df1 <- structure(list(year1 = c("16XX", "17XX", "17XX", "180X", "180X", "181X", "181X", "182X", "182X"),
pop1 = c("4675,0", "4739,3", "4834,0", "4930,0", "5029,0", "5129,0", "5231,9", "5297,0", "5362,0"),
year2 = c(1900L, 1901L, 1902L, 1903L, 1904L, 1905L, 1906L, 1907L, 1908L),
pop2 = c("6453,0", "6553,5", "6684,0", "6818,0", "6955,0", "7094,0", "7234,7", "7329,0", "7422,0"),
year3 = c(1930L, 1931L, 1932L, 1933L, 1934L, 1935L, 1936L, 1937L, 1938L),
pop3 = c("9981,2", "10583,5", "8671,0", "9118,0", "9625,0", "8097,0", "7984,7", "8729,0", "10462,0")),
.Names = c("year1", "pop1", "year2", "pop2", "year3", "pop3"), class = "data.frame", row.names = c(NA, -9L))