Select the first and last row by group in a data frame

Solution 1:

A fast and short data.table solution :

tmp[, .SD[c(1,.N)], by=id]

where .SD represents each (S)ubset of (D)ata, .N is the number of rows in each group and tmp is a data.table; e.g. as provided by fread() by default or by converting a data.frame using setDT().

Note that if a group only contains one row, that row will appear twice in the output because that row is both the first and last row of that group. To avoid the repetition in that case, thanks to @Thell:

tmp[, .SD[unique(c(1,.N))], by=id]

Alternatively, the following makes the logic explicit for the .N==1 special case :

tmp[, if (.N==1) .SD else .SD[c(1,.N)], by=id]

You don't need .SD[1] in the first part of the if because in that case .N is 1 so .SD must be just one row anyway.

You can wrap j in {} and have a whole page of code inside {} if you like. Just as long as the last expression inside {} returns a list- like object to be stacked (such as a plain list, data.table or data.frame).

tmp[, { ...; if (.N==1) .SD else .SD[c(1,.N)] } , by=id]

Solution 2:

A plyr solution (tmp is your data frame):

library("plyr")
ddply(tmp, .(id), function(x) x[c(1, nrow(x)), ])
#    id d gr  mm area
# 1  15 1  2 3.4    1
# 2  15 1  1 5.5    2
# 3  21 1  1 4.0    2
# 4  21 1  2 3.8    2
# 5  22 1  1 4.0    2
# 6  22 1  2 4.6    2
# 7  23 1  1 2.7    2
# 8  23 1  2 3.0    2
# 9  24 1  1 3.0    2
# 10 24 1  2 2.0    3

Or with dplyr (see also here):

library("dplyr")
tmp %>%
group_by(id) %>%
slice(c(1, n())) %>%
ungroup()
# # A tibble: 10 × 5
#       id     d    gr    mm  area
#    <int> <int> <int> <dbl> <int>
# 1     15     1     2   3.4     1
# 2     15     1     1   5.5     2
# 3     21     1     1   4.0     2
# 4     21     1     2   3.8     2
# 5     22     1     1   4.0     2
# 6     22     1     2   4.6     2
# 7     23     1     1   2.7     2
# 8     23     1     2   3.0     2
# 9     24     1     1   3.0     2
# 10    24     1     2   2.0     3

Solution 3:

Here is a solution in base R. If there are multiple groups with the same id this code returns the first and last row for each of those individual groups.

EDIT: January 12, 2017

This solution might be a little more intuitive than my other answer farther below:

lmy.df = read.table(text = '
     id    d    gr     mm  area
     15    1     2   3.40     1
     15    1     1   4.90     2
     15    1     1   4.40     1
     15    1     1   5.50     2
     21    1     1   4.00     2
     21    1     2   3.80     2
     22    1     1   4.00     2
     23    1     1   2.70     2
     23    1     1   4.00     2
     23    1     2   3.00     2
     24    1     1   3.00     2
     24    1     1   2.00     3
     24    1     1   4.00     2
     24    1     2   2.00     3
', header = TRUE)

head <- aggregate(lmy.df, by=list(lmy.df$id), FUN = function(x) { first = head(x,1) } )
tail <- aggregate(lmy.df, by=list(lmy.df$id), FUN = function(x) {  last = tail(x,1) } )
head$order = 'first'
tail$order = 'last'

my.output <- rbind(head, tail)
my.output
#   Group.1 id d gr  mm area order
#1       15 15 1  2 3.4    1 first
#2       21 21 1  1 4.0    2 first
#3       22 22 1  1 4.0    2 first
#4       23 23 1  1 2.7    2 first
#5       24 24 1  1 3.0    2 first
#6       15 15 1  1 5.5    2  last
#7       21 21 1  2 3.8    2  last
#8       22 22 1  1 4.0    2  last
#9       23 23 1  2 3.0    2  last
#10      24 24 1  2 2.0    3  last

EDIT: June 18, 2016

Since posting my original answer I have learned it is better to use lapply than apply. This is because apply does not work if every group has the same number of rows. See here: Error when numbering rows by group

lmy.df = read.table(text = '
     id    d    gr     mm  area
     15    1     2   3.40     1
     15    1     1   4.90     2
     15    1     1   4.40     1
     15    1     1   5.50     2
     21    1     1   4.00     2
     21    1     2   3.80     2
     22    1     1   4.00     2
     23    1     1   2.70     2
     23    1     1   4.00     2
     23    1     2   3.00     2
     24    1     1   3.00     2
     24    1     1   2.00     3
     24    1     1   4.00     2
     24    1     2   2.00     3
', header = TRUE)


lmy.seq <- rle(lmy.df$id)$lengths
lmy.df$first <- unlist(lapply(lmy.seq, function(x) seq(1,x)))
lmy.df$last  <- unlist(lapply(lmy.seq, function(x) seq(x,1,-1)))
lmy.df

lmy.df2 <- lmy.df[lmy.df$first==1 | lmy.df$last == 1,]
lmy.df2

#   id d gr  mm area first last
#1  15 1  2 3.4    1     1    4
#4  15 1  1 5.5    2     4    1
#5  21 1  1 4.0    2     1    2
#6  21 1  2 3.8    2     2    1
#7  22 1  1 4.0    2     1    1
#8  23 1  1 2.7    2     1    3
#10 23 1  2 3.0    2     3    1
#11 24 1  1 3.0    2     1    4
#14 24 1  2 2.0    3     4    1

Here is an example in which each group has two rows:

lmy.df = read.table(text = '
     id    d    gr     mm  area
     15    1     2   3.40     1
     15    1     1   4.90     2
     21    1     1   4.00     2
     21    1     2   3.80     2
     22    1     1   4.00     2
     22    1     1   6.00     2
     23    1     1   2.70     2
     23    1     2   3.00     2
     24    1     1   3.00     2
     24    1     2   2.00     3
', header = TRUE)

lmy.seq <- rle(lmy.df$id)$lengths

lmy.df$first <- unlist(lapply(lmy.seq, function(x) seq(1,x)))
lmy.df$last  <- unlist(lapply(lmy.seq, function(x) seq(x,1,-1)))
lmy.df

lmy.df2 <- lmy.df[lmy.df$first==1 | lmy.df$last == 1,]
lmy.df2

#   id d gr  mm area first last
#1  15 1  2 3.4    1     1    2
#2  15 1  1 4.9    2     2    1
#3  21 1  1 4.0    2     1    2
#4  21 1  2 3.8    2     2    1
#5  22 1  1 4.0    2     1    2
#6  22 1  1 6.0    2     2    1
#7  23 1  1 2.7    2     1    2
#8  23 1  2 3.0    2     2    1
#9  24 1  1 3.0    2     1    2
#10 24 1  2 2.0    3     2    1

Original answer:

my.seq <- data.frame(rle(my.df$id)$lengths)

my.df$first <- unlist(apply(my.seq, 1, function(x) seq(1,x)))
my.df$last  <- unlist(apply(my.seq, 1, function(x) seq(x,1,-1)))

my.df2 <- my.df[my.df$first==1 | my.df$last == 1,]
my.df2

   id d gr  mm area first last
1  15 1  2 3.4    1     1    4
4  15 1  1 5.5    2     4    1
5  21 1  1 4.0    2     1    2
6  21 1  2 3.8    2     2    1
7  22 1  1 4.0    2     1    3
9  22 1  2 4.6    2     3    1
10 23 1  1 2.7    2     1    3
12 23 1  2 3.0    2     3    1
13 24 1  1 3.0    2     1    4
16 24 1  2 2.0    3     4    1