Find all parent ids that are associated with two distinct child ids in r
I am trying to find a filtering method in R that can work quickly with large datasets. I have data that has a Parent_ID
. Each Parent_ID
can have multiple rows, each with a Child_ID
.
I have data that I want to find all the Parent_IDs
that are associated with both Child_ID = 1
and Child_ID = 2
like:
parent_id child_id
[1,] 1 1
[2,] 1 1
[3,] 1 2
[4,] 2 1
[5,] 2 2
[6,] 3 3
[7,] 3 1
[8,] 3 4
[9,] 4 4
[10,] 4 2
In the above example, it should return parent_ids 1,2
since those are the two Parent_IDs
that have both child_id = 1
and child_id = 2
.
parent_id = 3
is not returned because it has child_id=1
but not child_id=2
. parent_id=4
is not returned because it has child_id=2
but not child_id=1
.
My current method is to loop through each parent_id
and then search for the child_ids
but I'm looking for something faster.
# define the child_ids we're looking for
wanted <- c(1,2)
# find the unique parent ids
uniq_parents <- unique(df$parent_id)
# loop through each parent_id
match <- vector()
for(i in 1:length(uniq_parents))
{
one_parent <- df[df$parent_id == uniq_parents[i],]
found <- wanted%in%one_parent$child_id
if(sum(found) == length(wanted))
{
match <- c(match,one_parent$parent_id)
}
}
match <- unique(match)
> match
[1] 1 2
Solution 1:
intersect(df[df$child_id == 1, 'parent_id'], df[df$child_id == 2, 'parent_id'])
You want the parent_ids which have both "1" and "2" childs, so the intersection of those which have "1" children and those which have "2" chidlren.
Here's another implementation to generalize it to any number of children:
find_parent_ids_for_children <- function(df, child_ids) {
df <- unique(df)
df <- df[df$child_id %in% child_ids, , drop = FALSE]
pids <- rle(sort(df$parent_id))
pids$values[pids$lengths == length(child_ids)]
}
df0 <- data.frame(parent_id=c(1,1,1,2,2,3,3,3,4,4), child_id=c(1,1,2,1,2,3,1,4,4,2))
find_parent_ids_for_children(df0, 1:2)
# [1] 1 2
find_parent_ids_for_children(df0, 1:3)
# numeric(0)
find_parent_ids_for_children(df0, 2:3)
# numeric(0)
find_parent_ids_for_children(df0, c(1, 3))
# [1] 3
Solution 2:
I will also add an answer in dplyr, which is great for filtering etc.
# create data frame
dat <- data.frame(parent_id = c(1, 1, 1, 2, 2, 3, 3, 3, 4, 4), child_id = c(1, 1, 2, 1, 2, 3, 1, 4, 4, 2))
# filter the table for the criteria you presented
result_parents <- dat %>%
group_by(parent_id) %>%
filter(any(child_id == 1) & any(child_id == 2))
# if you really only want the parent_id which match your criteria
unique(result_parents$parent_id)
Solution 3:
Here are data.table
and dplyr
solutions. I'll be interested to see how they perform on larger data.
library(dplyr)
df %>%
group_by(parent_id) %>%
filter(all(wanted %in% child_id)) %>%
summarize(cur_group()) %>%
pull(parent_id)
library(data.table)
dt = as.data.table(df)
setkey(dt, parent_id)
dt[, .SD[all(wanted %in% child_id), ], by = parent_id][, .I[1], by = parent_id]$parent_id
Using this sample data:
df = read.table(text = ' parent_id child_id
[1,] 1 1
[2,] 1 1
[3,] 1 2
[4,] 2 1
[5,] 2 2
[6,] 3 3
[7,] 3 1
[8,] 3 4
[9,] 4 4
[10,] 4 2', header =T)
wanted <- c(1,2)
benchmarking
I ran a bench mark, which I found quite surprising. If you've jsut got 2 child IDs in wanted
, the intersect
method is by far the fastest, most memory efficient, and simplest code.
# A tibble: 5 × 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
<bch:expr> <dbl> <dbl> <dbl> <bch:byt> <dbl> <int> <dbl> <dbl>
1 intersect 0.333 0.373 1418. 1.54MB 27.7 716 14 505.
2 dplyr_em 12.3 12.8 74.2 3.1MB 11.7 38 6 512.
3 dplyr_gregor 12.7 13.2 65.3 5.05MB 9.89 33 5 505.
4 rle 46.1 54.4 18.6 9.87MB 13.0 10 7 538.
5 data.table 213. 215. 4.64 83.26MB 10.8 3 7 647.
# … with 4 more variables: result <list>, memory <list>, time <list>, gc <list>
Benchmarking code:
set.seed(47)
n = 1e5
df = data.frame(parent_id = sample(1:(n / 20), size = n, replace = T), child_id = sample(1:(n / 500), size = n, replace = T))
df = arrange(df, parent_id, child_id)
wanted = c(1, 2)
dt = as.data.table(df)
setkey(dt, parent_id)
dt[, .SD[all(wanted %in% child_id), ], by = parent_id][, .I[1], by = parent_id]$parent_id
find_parent_ids_for_children <- function(df, child_ids) {
df <- unique(df)
df <- df[df$child_id %in% child_ids, , drop = FALSE]
pids <- rle(sort(df$parent_id))
pids$values[pids$lengths == length(child_ids)]
}
bench::mark(
intersect = intersect(df[df$child_id == wanted[1], 'parent_id'], df[df$child_id == wanted[2], 'parent_id']),
rle = find_parent_ids_for_children(df, wanted),
dplyr_gregor = {
df %>%
group_by(parent_id) %>%
filter(all(wanted %in% child_id)) %>%
pull(parent_id) %>%
unique()
},
data.table = dt[, .SD[all(wanted %in% child_id), ], by = parent_id][, .I[1], by = parent_id]$parent_id,
dplyr_em = {
df %>%
group_by(parent_id) %>%
filter(any(child_id == wanted[1]) & any(child_id == wanted[2])) %>%
pull(parent_id) %>%
unique()
},
time_unit = "ms"
) %>% arrange(median)