Select the first row by group
From a dataframe like this
test <- data.frame('id'= rep(1:5,2), 'string'= LETTERS[1:10])
test <- test[order(test$id), ]
rownames(test) <- 1:10
> test
id string
1 1 A
2 1 F
3 2 B
4 2 G
5 3 C
6 3 H
7 4 D
8 4 I
9 5 E
10 5 J
I want to create a new one with the first row of each id / string pair. If sqldf accepted R code within it, the query could look like this:
res <- sqldf("select id, min(rownames(test)), string
from test
group by id, string")
> res
id string
1 1 A
3 2 B
5 3 C
7 4 D
9 5 E
Is there a solution short of creating a new column like
test$row <- rownames(test)
and running the same sqldf query with min(row)?
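For reference, a sketch of that workaround (assuming the sqldf package is loaded and naming the new column rn; grouping by id alone yields one row per id, and SQLite, sqldf's default backend, returns the bare string column from the row where min(rn) occurs):
library(sqldf)
# materialise the row names as an integer column, then keep the row with the
# smallest row number within each id
test$rn <- as.integer(rownames(test))
res <- sqldf("select id, min(rn) as rn, string
              from test
              group by id")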
You can use duplicated
to do this very quickly.
test[!duplicated(test$id),]
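Applied to the sample data above, this keeps rows 1, 3, 5, 7 and 9:
test[!duplicated(test$id), ]
#   id string
# 1  1      A
# 3  2      B
# 5  3      C
# 7  4      D
# 9  5      E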
Benchmarks, for the speed freaks:
ju <- function() test[!duplicated(test$id),]
gs1 <- function() do.call(rbind, lapply(split(test, test$id), head, 1))
gs2 <- function() do.call(rbind, lapply(split(test, test$id), `[`, 1, ))
jply <- function() ddply(test,.(id),function(x) head(x,1))
jdt <- function() {
testd <- as.data.table(test)
setkey(testd,id)
# Initial solution (slow)
# testd[,lapply(.SD,function(x) head(x,1)),by = key(testd)]
# Faster options :
testd[!duplicated(id)] # (1)
# testd[, .SD[1L], by=key(testd)] # (2)
# testd[J(unique(id)),mult="first"] # (3)
# testd[ testd[,.I[1L],by=id]$V1 ] # (4) needs v1.8.3. Allows 2nd, 3rd etc
}
library(plyr)
library(data.table)
library(rbenchmark)
# sample data
set.seed(21)
test <- data.frame(id=sample(1e3, 1e5, TRUE), string=sample(LETTERS, 1e5, TRUE))
test <- test[order(test$id), ]
benchmark(ju(), gs1(), gs2(), jply(), jdt(),
replications=5, order="relative")[,1:6]
# test replications elapsed relative user.self sys.self
# 1 ju() 5 0.03 1.000 0.03 0.00
# 5 jdt() 5 0.03 1.000 0.03 0.00
# 3 gs2() 5 3.49 116.333 2.87 0.58
# 2 gs1() 5 3.58 119.333 3.00 0.58
# 4 jply() 5 3.69 123.000 3.11 0.51
Let's try that again, but with just the contenders from the first heat and with more data and more replications.
set.seed(21)
test <- data.frame(id=sample(1e4, 1e6, TRUE), string=sample(LETTERS, 1e6, TRUE))
test <- test[order(test$id), ]
benchmark(ju(), jdt(), replications=100, order="relative")[,1:6]
# test replications elapsed relative user.self sys.self
# 1 ju() 100 5.48 1.000 4.44 1.00
# 2 jdt() 100 6.92 1.263 5.70 1.15
I favor the dplyr approach: group_by(id) followed by either

- filter(row_number() == 1), or
- slice(1), or
- slice_head(n = 1)  # dplyr >= 1.0, or
- top_n(n = -1)      # top_n() internally uses the rank function; a negative n selects from the bottom of the rank

In some instances, arranging the ids after the group_by can be necessary.
library(dplyr)
# using filter(), top_n() or slice()
m1 <-
test %>%
group_by(id) %>%
filter(row_number()==1)
m2 <-
test %>%
group_by(id) %>%
slice(1)
m3 <-
test %>%
group_by(id) %>%
top_n(n = -1)
All three methods return the same result
# A tibble: 5 x 2
# Groups: id [5]
id string
<int> <fct>
1 1 A
2 2 B
3 3 C
4 4 D
5 5 E
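With dplyr >= 1.0, slice_head() gives the same result (a sketch, using m4 by analogy with the objects above):
m4 <-
test %>%
group_by(id) %>%
slice_head(n = 1)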
What about
DT <- data.table(test)
setkey(DT, id)
DT[J(unique(id)), mult = "first"]
Edit
There is also a unique method for data.tables
which will return the first row by key
jdtu <- function() unique(DT)
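Note that in more recent data.table versions (1.9.8 and later), unique() on a data.table compares all columns by default rather than just the key, so to keep the first-row-per-key behaviour pass by explicitly:
# restrict the uniqueness check to the key so this still returns the first row per id
jdtu <- function() unique(DT, by = key(DT))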
I think, if you are ordering test outside the benchmark, then you can remove the setkey and data.table conversion from the benchmark as well (since setkey basically sorts by id, the same as order).
set.seed(21)
test <- data.frame(id=sample(1e3, 1e5, TRUE), string=sample(LETTERS, 1e5, TRUE))
test <- test[order(test$id), ]
DT <- data.table(test, key = 'id')
ju <- function() test[!duplicated(test$id),]
jdt <- function() DT[J(unique(id)),mult = 'first']
library(rbenchmark)
benchmark(ju(), jdt(), replications = 5)
## test replications elapsed relative user.self sys.self
## 2 jdt() 5 0.01 1 0.02 0
## 1 ju() 5 0.05 5 0.05 0
and with more data
Edit with unique method
set.seed(21)
test <- data.frame(id=sample(1e4, 1e6, TRUE), string=sample(LETTERS, 1e6, TRUE))
test <- test[order(test$id), ]
DT <- data.table(test, key = 'id')
benchmark(ju(), jdt(), jdtu(), replications = 5)
##     test replications elapsed relative user.self sys.self
## 2  jdt()            5    0.09     2.25      0.09     0.00
## 3 jdtu()            5    0.04     1.00      0.05     0.00
## 1   ju()            5    0.22     5.50      0.19     0.03
The unique method is fastest here.
A simple ddply
option:
library(plyr)
ddply(test, .(id), function(x) head(x, 1))
If speed is an issue, a similar approach could be taken with data.table
:
testd <- data.table(test)
setkey(testd,id)
testd[,.SD[1],by = key(testd)]
or this might be considerably faster:
# .I[1] gives the row number of the first row in each group;
# subsetting by those indices avoids building .SD for every group
testd[testd[, .I[1], by = key(testd)]$V1]