How do I count groups of 3 tied to a name in a data frame?

Say I have the following dataset:

library(tidyr)
library(dplyr)

name1 <- c("John", "John", "John", "John", "John", "John", "John", "John", "John", "John", "Mary", "Mary", "Mary", "Mary", "Mary", "Mary", "Mary", "Mary", "Mary", "Mary","Anne", "Anne", "Anne", "Anne", "Anne", "Anne", "Anne", "Anne", "Anne", "Anne", "Joe", "Joe", "Joe", "Joe", "Joe",  "Joe", "Joe", "Joe", "Joe", "Joe", "David", "David", "David", "David", "David", "David", "David", "David", "David", "David")

name2 <- c("Mary", "Thomas", "Linda", "David", "Joe", "Carl", "Joel", "Victoria", "Elaine", "Cory", "June", "John", "Linda", "David", "Joe", "Holly", "Michael", "Dwight", "Jim", "Andy", "Mary", "John", "Linda", "David", "Joe", "Helen", "Lauren", "Bill", "Saul", "Ben", "Mary", "John", "Linda", "David", "Robert", "Holly", "Michael", "James", "Renee", "Sally", "Mary", "John", "Linda", "Paul", "Joe", "Peter", "Clark", "Elaine", "Cory", "Victoria")

df <- data.frame(name1, name2)

I want to be able to count the number of times each value in "name1" has the same 3 values in "name2". For example, John (name1) has Linda, David, and Joe (name2) and Mary (name 1) also has Linda, David, and Joe (name2). So if we only to look at John and Mary, the number of times all 3 names are in one group is 2. I'd like to produce a table that shows the number of times three names appears in name2 for a name in name1.

I know how I would write the function for pairs:

count_pairs <- 
  function(df) {
    df %>%
      apply(1, sort) %>%
      t() %>%
      data.frame() %>%
      group_by_all() %>%
      count(name = "Occurrences_Pair")
  }

df_pairs <- df %>% count_pairs()

But how would I do it for groups of 3, 4, 5, etc.?


Here's an approach with dplyr. This says, for instance, that John and Mary share 3 names, while John and David share 6.

First I join all rows with other rows matching on name2, then filter out matching name1 (all will match 100% with themselves), keep distinct matches, and count how many between each name1 and another.

left_join(mutate(df, val = 1),
          mutate(df, val = 1), by = c("val", "name2")) %>%
  filter(name1.x != name1.y) %>%
  distinct(name2, name1.x, name1.y) %>%
  count(name1.x, name1.y) %>%
  arrange(n)

Result

   name1.x name1.y n
1    David     Joe 3
2    David    Mary 3
3      Joe   David 3
4      Joe    John 3
5     John     Joe 3
6     John    Mary 3
7     Mary   David 3
8     Mary    John 3
9     Anne   David 4
10    Anne     Joe 4
11    Anne    John 4
12    Anne    Mary 4
13   David    Anne 4
14     Joe    Anne 4
15    John    Anne 4
16    Mary    Anne 4
17     Joe    Mary 5
18    Mary     Joe 5
19   David    John 6
20    John   David 6

or replace the last line with the lines below to produce a coincidence table:

complete(name1.x, name1.y, fill = list(n = 0)) %>%  # skip if order doesn't matter
pivot_wider(names_from = name1.y, values_from = n, values_fill = 0)

# A tibble: 5 x 6
  name1.x  Anne David   Joe  John  Mary
  <chr>   <dbl> <dbl> <dbl> <dbl> <dbl>
1 Anne        0     4     4     4     4
2 David       4     0     3     6     3
3 Joe         4     3     0     3     5
4 John        4     6     3     0     3
5 Mary        4     3     5     3     0

count_groups_of <- function(df, n) {
  varname <- as.symbol(paste0('combs_of_', n))
  df %>% 
    group_by(name1) %>% 
    summarise(!!varname := combn(name2, n, function(x) toString(sort(x))), .groups = 'drop') %>% 
    group_by(!!varname) %>% 
    summarise(n = n(), which_name1 = toString(name1), .groups = 'drop') %>% 
    arrange(-n)
}

library(dplyr, warn.conflicts = FALSE)

df %>% 
  count_groups_of(3)
#> # A tibble: 556 × 3
#>    combs_of_3                 n which_name1      
#>    <chr>                  <int> <chr>            
#>  1 David, Joe, Linda          3 Anne, John, Mary 
#>  2 David, John, Linda         3 Anne, Joe, Mary  
#>  3 David, Linda, Mary         3 Anne, Joe, John  
#>  4 Joe, John, Linda           3 Anne, David, Mary
#>  5 Joe, Linda, Mary           3 Anne, David, John
#>  6 John, Linda, Mary          3 Anne, David, Joe 
#>  7 Cory, Elaine, Joe          2 David, John      
#>  8 Cory, Elaine, Linda        2 David, John      
#>  9 Cory, Elaine, Mary         2 David, John      
#> 10 Cory, Elaine, Victoria     2 David, John      
#> # … with 546 more rows

df %>% 
  count_groups_of(4)
#> # A tibble: 1,026 × 3
#>    combs_of_4                        n which_name1
#>    <chr>                         <int> <chr>      
#>  1 Cory, Elaine, Joe, Linda          2 David, John
#>  2 Cory, Elaine, Joe, Mary           2 David, John
#>  3 Cory, Elaine, Joe, Victoria       2 David, John
#>  4 Cory, Elaine, Linda, Mary         2 David, John
#>  5 Cory, Elaine, Linda, Victoria     2 David, John
#>  6 Cory, Elaine, Mary, Victoria      2 David, John
#>  7 Cory, Joe, Linda, Mary            2 David, John
#>  8 Cory, Joe, Linda, Victoria        2 David, John
#>  9 Cory, Joe, Mary, Victoria         2 David, John
#> 10 Cory, Linda, Mary, Victoria       2 David, John
#> # … with 1,016 more rows

Created on 2022-01-10 by the reprex package (v2.0.1)


Here comes a fast solution using RcppAlgos::comboGeneral.

nm <- sort(unique(df$name2)) ## unique names
M <- matrix(df$name2, 10)  ## make a matrix, nrow acc. to name 1 groups

f <- \(n) RcppAlgos::comboGeneral(
  nm, n, FUN=\(x) c(x, n=sum(colSums(array(M %in% x, dim=dim(M))) == n))) |>
  do.call(what=rbind) |> as.data.frame() |> type.convert(as.is=TRUE) |>
  (\(.) .[order(-.$n), ])() |> `rownames<-`(NULL)

Gives

head(f(2))
#      V1    V2 n
# 1 David Linda 4
# 2   Joe Linda 4
# 3  John Linda 4
# 4 Linda  Mary 4
# 5 David   Joe 3
# 6 David  John 3
head(f(3))
#      V1    V2    V3 n
# 1 David   Joe Linda 3
# 2 David  John Linda 3
# 3 David Linda  Mary 3
# 4   Joe  John Linda 3
# 5   Joe Linda  Mary 3
# 6  John Linda  Mary 3
head(f(4))
#     V1     V2    V3       V4 n
# 1 Cory Elaine   Joe    Linda 2
# 2 Cory Elaine   Joe     Mary 2
# 3 Cory Elaine   Joe Victoria 2
# 4 Cory Elaine Linda     Mary 2
# 5 Cory Elaine Linda Victoria 2
# 6 Cory Elaine  Mary Victoria 2
head(f(5))
#      V1     V2    V3    V4       V5 n
# 1  Cory Elaine   Joe Linda     Mary 2
# 2  Cory Elaine   Joe Linda Victoria 2
# 3  Cory Elaine   Joe  Mary Victoria 2
# 4  Cory Elaine Linda  Mary Victoria 2
# 5  Cory    Joe Linda  Mary Victoria 2
# 6 David  Holly  John Linda  Michael 2
head(f(6))
#     V1     V2     V3    V4   V5       V6 n
# 1 Cory Elaine    Joe Linda Mary Victoria 2
# 2 Andy  David Dwight Holly  Jim      Joe 1
# 3 Andy  David Dwight Holly  Jim     John 1
# 4 Andy  David Dwight Holly  Jim     June 1
# 5 Andy  David Dwight Holly  Jim    Linda 1
# 6 Andy  David Dwight Holly  Jim  Michael 1

Note: R >= 4.1 used.


Data:

df <- structure(list(name1 = c("John", "John", "John", "John", "John", 
"John", "John", "John", "John", "John", "Mary", "Mary", "Mary", 
"Mary", "Mary", "Mary", "Mary", "Mary", "Mary", "Mary", "Anne", 
"Anne", "Anne", "Anne", "Anne", "Anne", "Anne", "Anne", "Anne", 
"Anne", "Joe", "Joe", "Joe", "Joe", "Joe", "Joe", "Joe", "Joe", 
"Joe", "Joe", "David", "David", "David", "David", "David", "David", 
"David", "David", "David", "David"), name2 = c("Mary", "Thomas", 
"Linda", "David", "Joe", "Carl", "Joel", "Victoria", "Elaine", 
"Cory", "June", "John", "Linda", "David", "Joe", "Holly", "Michael", 
"Dwight", "Jim", "Andy", "Mary", "John", "Linda", "David", "Joe", 
"Helen", "Lauren", "Bill", "Saul", "Ben", "Mary", "John", "Linda", 
"David", "Robert", "Holly", "Michael", "James", "Renee", "Sally", 
"Mary", "John", "Linda", "Paul", "Joe", "Peter", "Clark", "Elaine", 
"Cory", "Victoria")), class = "data.frame", row.names = c(NA, 
-50L))