Stratified random sampling from data frame

Solution 1:

I would suggest using either stratified from my "splitstackshape" package, or sample_n from the "dplyr" package:

## Sample data
set.seed(1)
n <- 1e4
d <- data.table(age = sample(1:5, n, T), 
                lc = rbinom(n, 1 , .5),
                ants = rbinom(n, 1, .7))
# table(d$age, d$lc)

For stratified, you basically specify the dataset, the stratifying columns, and an integer representing the size you want from each group OR a decimal representing the fraction you want returned (for example, .1 represents 10% from each group).

library(splitstackshape)
set.seed(1)
out <- stratified(d, c("age", "lc"), 30)
head(out)
#    age lc ants
# 1:   1  0    1
# 2:   1  0    0
# 3:   1  0    1
# 4:   1  0    1
# 5:   1  0    0
# 6:   1  0    1

table(out$age, out$lc)
#    
#      0  1
#   1 30 30
#   2 30 30
#   3 30 30
#   4 30 30
#   5 30 30

For sample_n you first create a grouped table (using group_by) and then specify the number of observations you want. If you wanted proportional sampling instead, you should use sample_frac.

library(dplyr)
set.seed(1)
out2 <- d %>%
  group_by(age, lc) %>%
  sample_n(30)

# table(out2$age, out2$lc)

Solution 2:

See the function strata from the package sampling. The function selects stratified simple random sampling and gives a sample as a result. Extra two columns are added - inclusion probabilities (Prob) and strata indicator (Stratum). See the example.

require(data.table)
require(sampling)

set.seed(1)
n <- 1e4
d <- data.table(age = sample(1:5, n, T), 
                lc = rbinom(n, 1 , .5),
                ants = rbinom(n, 1, .7))

# Sort
setkey(d, age, lc)

# Population size by strata
d[, .N, keyby = list(age, lc)]
#     age lc    N
#  1:   1  0 1010
#  2:   1  1 1002
#  3:   2  0  993
#  4:   2  1 1026
#  5:   3  0 1021
#  6:   3  1  982
#  7:   4  0  958
#  8:   4  1  940
#  9:   5  0 1012
# 10:   5  1 1056

# Select sample
set.seed(2)
s <- data.table(strata(d, c("age", "lc"), rep(30, 10), "srswor"))

# Sample size by strata
s[, .N, keyby = list(age, lc)]
#     age lc  N
#  1:   1  0 30
#  2:   1  1 30
#  3:   2  0 30
#  4:   2  1 30
#  5:   3  0 30
#  6:   3  1 30
#  7:   4  0 30
#  8:   4  1 30
#  9:   5  0 30
# 10:   5  1 30