Find all parent ids that are associated with two distinct child ids in r

I am trying to find a filtering method in R that can work quickly with large datasets. I have data that has a Parent_ID. Each Parent_ID can have multiple rows, each with a Child_ID.

I have data that I want to find all the Parent_IDs that are associated with both Child_ID = 1 and Child_ID = 2 like:

      parent_id child_id
 [1,]         1        1
 [2,]         1        1
 [3,]         1        2
 [4,]         2        1
 [5,]         2        2
 [6,]         3        3
 [7,]         3        1
 [8,]         3        4
 [9,]         4        4
[10,]         4        2

In the above example, it should return parent_ids 1,2 since those are the two Parent_IDs that have both child_id = 1 and child_id = 2.

parent_id = 3 is not returned because it has child_id=1 but not child_id=2. parent_id=4 is not returned because it has child_id=2 but not child_id=1.

My current method is to loop through each parent_id and then search for the child_ids but I'm looking for something faster.

# define the child_ids we're looking for
wanted <- c(1,2)
# find the unique parent ids
uniq_parents <- unique(df$parent_id)
# loop through each parent_id
match <- vector()
for(i in 1:length(uniq_parents))
{
  one_parent <- df[df$parent_id == uniq_parents[i],]
  found <- wanted%in%one_parent$child_id
  if(sum(found) == length(wanted))
  {
    match <- c(match,one_parent$parent_id)
  }
}
match <- unique(match)

> match
[1] 1 2

Solution 1:

intersect(df[df$child_id == 1, 'parent_id'], df[df$child_id == 2, 'parent_id'])

You want the parent_ids which have both "1" and "2" childs, so the intersection of those which have "1" children and those which have "2" chidlren.

Here's another implementation to generalize it to any number of children:

find_parent_ids_for_children <- function(df, child_ids) {
  df <- unique(df)
  df <- df[df$child_id %in% child_ids, , drop = FALSE]
  pids <- rle(sort(df$parent_id))
  
  pids$values[pids$lengths == length(child_ids)]
}

df0 <- data.frame(parent_id=c(1,1,1,2,2,3,3,3,4,4), child_id=c(1,1,2,1,2,3,1,4,4,2))

find_parent_ids_for_children(df0, 1:2)
# [1] 1 2

find_parent_ids_for_children(df0, 1:3)
# numeric(0)

find_parent_ids_for_children(df0, 2:3)
# numeric(0)
find_parent_ids_for_children(df0, c(1, 3))
# [1] 3

Solution 2:

I will also add an answer in dplyr, which is great for filtering etc.

# create data frame
dat <- data.frame(parent_id = c(1, 1, 1, 2, 2, 3, 3, 3, 4, 4), child_id = c(1, 1, 2, 1, 2, 3, 1, 4, 4, 2))

# filter the table for the criteria you presented
result_parents <- dat %>%
  group_by(parent_id) %>%
  filter(any(child_id == 1) & any(child_id == 2))
    
# if you really only want the parent_id which match your criteria          
unique(result_parents$parent_id)

Solution 3:

Here are data.table and dplyr solutions. I'll be interested to see how they perform on larger data.

library(dplyr)
df %>%
  group_by(parent_id) %>%
  filter(all(wanted %in% child_id)) %>%
  summarize(cur_group()) %>%
  pull(parent_id)
library(data.table)
dt = as.data.table(df)
setkey(dt, parent_id)
dt[, .SD[all(wanted %in% child_id), ], by = parent_id][, .I[1], by = parent_id]$parent_id

Using this sample data:

df = read.table(text = '     parent_id child_id
 [1,]         1        1
 [2,]         1        1
 [3,]         1        2
 [4,]         2        1
 [5,]         2        2
 [6,]         3        3
 [7,]         3        1
 [8,]         3        4
 [9,]         4        4
[10,]         4        2', header =T)

wanted <- c(1,2)

benchmarking

I ran a bench mark, which I found quite surprising. If you've jsut got 2 child IDs in wanted, the intersect method is by far the fastest, most memory efficient, and simplest code.

# A tibble: 5 × 13
  expression       min  median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time
  <bch:expr>     <dbl>   <dbl>     <dbl> <bch:byt>    <dbl> <int> <dbl>      <dbl>
1 intersect      0.333   0.373   1418.      1.54MB    27.7    716    14       505.
2 dplyr_em      12.3    12.8       74.2      3.1MB    11.7     38     6       512.
3 dplyr_gregor  12.7    13.2       65.3     5.05MB     9.89    33     5       505.
4 rle           46.1    54.4       18.6     9.87MB    13.0     10     7       538.
5 data.table   213.    215.         4.64   83.26MB    10.8      3     7       647.
# … with 4 more variables: result <list>, memory <list>, time <list>, gc <list>

Benchmarking code:

set.seed(47)
n = 1e5
df = data.frame(parent_id = sample(1:(n / 20), size = n, replace = T), child_id = sample(1:(n / 500), size = n, replace = T))
df = arrange(df, parent_id, child_id) 

wanted = c(1, 2)

dt = as.data.table(df)
setkey(dt, parent_id)
dt[, .SD[all(wanted %in% child_id), ], by = parent_id][, .I[1], by = parent_id]$parent_id

find_parent_ids_for_children <- function(df, child_ids) {
  df <- unique(df)
  df <- df[df$child_id %in% child_ids, , drop = FALSE]
  pids <- rle(sort(df$parent_id))
  
  pids$values[pids$lengths == length(child_ids)]
}


bench::mark(
  intersect = intersect(df[df$child_id == wanted[1], 'parent_id'], df[df$child_id == wanted[2], 'parent_id']),
  rle = find_parent_ids_for_children(df, wanted),
  dplyr_gregor = {
    df %>%
      group_by(parent_id) %>%
      filter(all(wanted %in% child_id)) %>%
      pull(parent_id) %>%
      unique()
  },
  data.table = dt[, .SD[all(wanted %in% child_id), ], by = parent_id][, .I[1], by = parent_id]$parent_id,
  dplyr_em = {
    df %>%
      group_by(parent_id) %>%
      filter(any(child_id == wanted[1]) & any(child_id == wanted[2])) %>%
      pull(parent_id) %>%
      unique()
  },
  time_unit = "ms"
) %>% arrange(median)