dplyr - filter by group size
What is the best way to filter a data.frame to get only groups of, say, size 5?
So my data looks as follows:
require(dplyr)
n <- 1e5
x <- rnorm(n)
# Each category appears between 1 and 5 times
cat <- rep(seq_len(n/3), sample(1:5, n/3, replace = TRUE))[1:n]
dat <- data.frame(x = x, cat = cat)
The dplyr way I could come up with was:
dat <- group_by(dat, cat)
system.time({
  out1 <- dat %>% filter(n() == 5L)
})
# user system elapsed
# 1.157 0.218 1.497
But this is very slow... Is there a better way in dplyr?
So far, my workaround solution looks as follows:
system.time({
  all_ind <- rep(seq_len(n_groups(dat)), group_size(dat))
  take_only <- which(group_size(dat) == 5L)
  out2 <- dat[all_ind %in% take_only, ]
})
# user system elapsed
# 0.026 0.008 0.036
all.equal(out1, out2) # TRUE
But this doesn't feel very dplyr-like...
Solution 1:
You can do it more concisely with n():
library(dplyr)
dat %>% group_by(cat) %>% filter(n() == 5)
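If you are on dplyr 1.1.0 or later, the same check can also be done with per-operation grouping via the .by argument, which returns an ungrouped result (a small sketch, assuming that version is available):
library(dplyr)
# Per-operation grouping (dplyr >= 1.1.0); no group_by()/ungroup() needed
dat %>% filter(n() == 5, .by = cat)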
Solution 2:
Here's another dplyr approach you can try:
semi_join(dat, count(dat, cat) %>% filter(n == 5), by = "cat")
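For clarity: count(dat, cat) tabulates the size of each category into a column named n, and semi_join() then keeps the rows of dat whose cat appears in the filtered count table, without adding any columns. The same pipeline spelled out in two steps (purely illustrative; the intermediate name keep is made up):
keep <- count(dat, cat) %>% filter(n == 5)  # one row per category of size 5
semi_join(dat, keep, by = "cat")            # keep only matching rows of dat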
--
Here's another approach, based on the OP's original one with a little modification:
n <- 1e5
x <- rnorm(n)
# Each category appears between 1 and 5 times
cat <- rep(seq_len(n/3), sample(1:5, n/3, replace = TRUE))[1:n]
dat <- data.frame(x = x, cat = cat)
# Second copy of the data for the second approach
dat2 <- data.frame(x = x, cat = cat)
sol_floo0 <- function(dat){
  dat <- group_by(dat, cat)
  all_ind <- rep(seq_len(n_groups(dat)), group_size(dat))
  take_only <- which(group_size(dat) == 5L)
  dat[all_ind %in% take_only, ]
}
sol_floo0_v2 <- function(dat){
  g <- group_by(dat, cat) %>% group_size()
  ind <- rep(g == 5, g)  # expand the per-group test to a per-row index
  dat[ind, ]
}
microbenchmark::microbenchmark(times = 10,
  sol_floo0(dat),
  sol_floo0_v2(dat2))
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# sol_floo0(dat) 43.72903 44.89957 45.71121 45.10773 46.59019 48.64595 10 b
# sol_floo0_v2(dat2) 29.83724 30.56719 32.92777 31.97169 34.10451 38.31037 10 a
all.equal(sol_floo0(dat), sol_floo0_v2(dat2))
#[1] TRUE
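The speed-up in sol_floo0_v2 comes from rep(g == 5, g): the per-group logical (is this group of size 5?) is expanded to one entry per row, so no %in% lookup over all rows is needed. Note that this relies on the rows of dat already being ordered by cat, which holds for the data generated above. A tiny illustration with made-up group sizes:
g <- c(2, 5, 3)   # toy group sizes
rep(g == 5, g)
# FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE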
Solution 3:
I know you asked for a dplyr solution, but if you combine it with some purrr you can get it in a single pipeline without defining any new functions. (A little slower, though.)
library(dplyr)
library(purrr)
library(tidyr)
dat %>%
  group_by(cat) %>%
  nest() %>%
  mutate(n = map_int(data, n_distinct)) %>%
  filter(n == 5) %>%
  select(cat, n)
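One caveat on the pipeline above: n_distinct() counts distinct rows, which matches the group size here only because x contains no duplicated values; map_int(data, nrow) counts rows directly and is the safer choice. The same pipeline with that one change:
dat %>%
  group_by(cat) %>%
  nest() %>%
  mutate(n = map_int(data, nrow)) %>%  # row count instead of distinct-row count
  filter(n == 5) %>%
  select(cat, n)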
Solution 4:
Comparing the timings of the answers:
require(dplyr)
require(data.table)
n <- 1e5
x <- rnorm(n)
# Each category appears between 1 and 5 times
cat <- rep(seq_len(n/3), sample(1:5, n/3, replace = TRUE))[1:n]
dat <- data.frame(x = x, cat = cat)
# Second data set for the data.table approach
dat2 <- data.frame(x = x, cat = cat)
sol_floo0 <- function(dat){
  dat <- group_by(dat, cat)
  all_ind <- rep(seq_len(n_groups(dat)), group_size(dat))
  take_only <- which(group_size(dat) == 5L)
  dat[all_ind %in% take_only, ]
}
sol_floo0_v2 <- function(dat){
  g <- group_by(dat, cat) %>% group_size()
  ind <- rep(g == 5, g)
  dat[ind, ]
}
sol_docendo_discimus <- function(dat){
  dat <- group_by(dat, cat)
  semi_join(dat, count(dat, cat) %>% filter(n == 5), by = "cat")
}
sol_akrun <- function(dat2){
  # .I[.N == 5] collects the row indices of all groups of exactly 5 rows
  setDT(dat2)[dat2[, .I[.N==5], by = cat]$V1]
}
sol_sotos <- function(dat2){
  # return the whole subset .SD only for groups of exactly 5 rows
  setDT(dat2)[, if(.N == 5) .SD, by = cat]
}
sol_chirayu_chamoli <- function(dat){
  # rle() works here because equal cat values are contiguous
  rle_ <- rle(dat$cat)
  dat[dat$cat %in% rle_$values[rle_$lengths==5], ]
}
microbenchmark::microbenchmark(times = 20,
  sol_floo0(dat),
  sol_floo0_v2(dat),
  sol_docendo_discimus(dat),
  sol_akrun(dat2),
  sol_sotos(dat2),
  sol_chirayu_chamoli(dat))
Results in:
Unit: milliseconds
expr min lq mean median uq max neval cld
sol_floo0(dat) 58.00439 65.28063 93.54014 69.82658 82.79997 280.23114 20 cd
sol_floo0_v2(dat) 42.27791 50.27953 72.51729 58.63931 67.62540 238.97413 20 bc
sol_docendo_discimus(dat) 100.54095 113.15476 126.74142 121.69013 132.62533 183.05818 20 d
sol_akrun(dat2) 26.88369 34.01925 41.04378 37.07957 45.44784 63.95430 20 ab
sol_sotos(dat2) 16.10177 19.78403 24.04375 23.06900 28.05470 35.83611 20 a
sol_chirayu_chamoli(dat) 20.67951 24.18100 38.01172 27.61618 31.97834 230.51026 20 ab
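One benchmarking caveat: setDT() converts dat2 to a data.table by reference, so the input is already converted after the first iteration. To benchmark without mutating the input, a copying variant could be used instead (a sketch, not one of the original answers):
sol_sotos_copy <- function(dat2){
  # as.data.table() makes a copy, leaving the original data.frame untouched
  data.table::as.data.table(dat2)[, if (.N == 5) .SD, by = cat]
}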
Solution 5:
I generalised the function written by docendo discimus so it can be used alongside existing dplyr functions:
#' @inherit dplyr::filter
#' @param min minimal group size, use \code{min = NULL} to filter on maximal group size only
#' @param max maximal group size, use \code{max = NULL} to filter on minimal group size only
#' @export
#' @source Stack Overflow answer by docendo discimus, \url{https://stackoverflow.com/a/43110620/4575331}
filter_group_size <- function(.data, min = NULL, max = min) {
  g <- dplyr::group_size(.data)
  if (is.null(min) && is.null(max)) {
    stop('`min` and `max` cannot both be NULL.')
  }
  if (is.null(min)) {
    min <- 0L  # no lower bound: filter on maximal group size only
  }
  if (is.null(max)) {
    max <- base::max(g, na.rm = TRUE)  # no upper bound: filter on minimal group size only
  }
  ind <- base::rep(g >= min & g <= max, g)
  .data[ind, ]
}
Let's check it for a minimal group size of 5:
dat2 %>%
  group_by(cat) %>%
  filter_group_size(5, NULL) %>%
  summarise(n = n()) %>%
  arrange(desc(n))
# # A tibble: 6,634 x 2
# cat n
# <int> <int>
# 1 NA 19
# 2 1 5
# 3 2 5
# 4 6 5
# 5 15 5
# 6 17 5
# 7 21 5
# 8 27 5
# 9 33 5
# 10 37 5
# # ... with 6,624 more rows
(The NA group appears because rep() can return fewer than n elements, in which case the [1:n] subset pads the tail of cat with NAs.) Great, now check the OP's question: a group size of exactly 5:
dat2 %>%
  group_by(cat) %>%
  filter_group_size(5) %>%
  summarise(n = n()) %>%
  pull(n) %>%
  unique()
# [1] 5
Hooray.
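With the min = NULL branch handled above, the maximum-only filter works as well, e.g. keeping only groups of size at most 4 (illustrative call; output omitted):
dat2 %>%
  group_by(cat) %>%
  filter_group_size(NULL, 4) %>%
  summarise(n = n()) %>%
  pull(n) %>%
  max()
# should be at most 4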