How do I count groups of 3 tied to a name in a data frame?
Say I have the following dataset:
library(tidyr)
library(dplyr)
name1 <- c("John", "John", "John", "John", "John", "John", "John", "John", "John", "John", "Mary", "Mary", "Mary", "Mary", "Mary", "Mary", "Mary", "Mary", "Mary", "Mary","Anne", "Anne", "Anne", "Anne", "Anne", "Anne", "Anne", "Anne", "Anne", "Anne", "Joe", "Joe", "Joe", "Joe", "Joe", "Joe", "Joe", "Joe", "Joe", "Joe", "David", "David", "David", "David", "David", "David", "David", "David", "David", "David")
name2 <- c("Mary", "Thomas", "Linda", "David", "Joe", "Carl", "Joel", "Victoria", "Elaine", "Cory", "June", "John", "Linda", "David", "Joe", "Holly", "Michael", "Dwight", "Jim", "Andy", "Mary", "John", "Linda", "David", "Joe", "Helen", "Lauren", "Bill", "Saul", "Ben", "Mary", "John", "Linda", "David", "Robert", "Holly", "Michael", "James", "Renee", "Sally", "Mary", "John", "Linda", "Paul", "Joe", "Peter", "Clark", "Elaine", "Cory", "Victoria")
df <- data.frame(name1, name2)
I want to be able to count the number of times each value in "name1" has the same 3 values in "name2". For example, John (name1) has Linda, David, and Joe (name2) and Mary (name 1) also has Linda, David, and Joe (name2). So if we only to look at John and Mary, the number of times all 3 names are in one group is 2. I'd like to produce a table that shows the number of times three names appears in name2 for a name in name1.
I know how I would write the function for pairs:
count_pairs <-
function(df) {
df %>%
apply(1, sort) %>%
t() %>%
data.frame() %>%
group_by_all() %>%
count(name = "Occurrences_Pair")
}
df_pairs <- df %>% count_pairs()
But how would I do it for groups of 3, 4, 5, etc.?
Here's an approach with dplyr
. This says, for instance, that John and Mary share 3 names, while John and David share 6.
First I join all rows with other rows matching on name2
, then filter out matching name1
(all will match 100% with themselves), keep distinct matches, and count how many between each name1 and another.
left_join(mutate(df, val = 1),
mutate(df, val = 1), by = c("val", "name2")) %>%
filter(name1.x != name1.y) %>%
distinct(name2, name1.x, name1.y) %>%
count(name1.x, name1.y) %>%
arrange(n)
Result
name1.x name1.y n
1 David Joe 3
2 David Mary 3
3 Joe David 3
4 Joe John 3
5 John Joe 3
6 John Mary 3
7 Mary David 3
8 Mary John 3
9 Anne David 4
10 Anne Joe 4
11 Anne John 4
12 Anne Mary 4
13 David Anne 4
14 Joe Anne 4
15 John Anne 4
16 Mary Anne 4
17 Joe Mary 5
18 Mary Joe 5
19 David John 6
20 John David 6
or replace the last line with the lines below to produce a coincidence table:
complete(name1.x, name1.y, fill = list(n = 0)) %>% # skip if order doesn't matter
pivot_wider(names_from = name1.y, values_from = n, values_fill = 0)
# A tibble: 5 x 6
name1.x Anne David Joe John Mary
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Anne 0 4 4 4 4
2 David 4 0 3 6 3
3 Joe 4 3 0 3 5
4 John 4 6 3 0 3
5 Mary 4 3 5 3 0
count_groups_of <- function(df, n) {
varname <- as.symbol(paste0('combs_of_', n))
df %>%
group_by(name1) %>%
summarise(!!varname := combn(name2, n, function(x) toString(sort(x))), .groups = 'drop') %>%
group_by(!!varname) %>%
summarise(n = n(), which_name1 = toString(name1), .groups = 'drop') %>%
arrange(-n)
}
library(dplyr, warn.conflicts = FALSE)
df %>%
count_groups_of(3)
#> # A tibble: 556 × 3
#> combs_of_3 n which_name1
#> <chr> <int> <chr>
#> 1 David, Joe, Linda 3 Anne, John, Mary
#> 2 David, John, Linda 3 Anne, Joe, Mary
#> 3 David, Linda, Mary 3 Anne, Joe, John
#> 4 Joe, John, Linda 3 Anne, David, Mary
#> 5 Joe, Linda, Mary 3 Anne, David, John
#> 6 John, Linda, Mary 3 Anne, David, Joe
#> 7 Cory, Elaine, Joe 2 David, John
#> 8 Cory, Elaine, Linda 2 David, John
#> 9 Cory, Elaine, Mary 2 David, John
#> 10 Cory, Elaine, Victoria 2 David, John
#> # … with 546 more rows
df %>%
count_groups_of(4)
#> # A tibble: 1,026 × 3
#> combs_of_4 n which_name1
#> <chr> <int> <chr>
#> 1 Cory, Elaine, Joe, Linda 2 David, John
#> 2 Cory, Elaine, Joe, Mary 2 David, John
#> 3 Cory, Elaine, Joe, Victoria 2 David, John
#> 4 Cory, Elaine, Linda, Mary 2 David, John
#> 5 Cory, Elaine, Linda, Victoria 2 David, John
#> 6 Cory, Elaine, Mary, Victoria 2 David, John
#> 7 Cory, Joe, Linda, Mary 2 David, John
#> 8 Cory, Joe, Linda, Victoria 2 David, John
#> 9 Cory, Joe, Mary, Victoria 2 David, John
#> 10 Cory, Linda, Mary, Victoria 2 David, John
#> # … with 1,016 more rows
Created on 2022-01-10 by the reprex package (v2.0.1)
Here comes a fast solution using RcppAlgos::comboGeneral
.
nm <- sort(unique(df$name2)) ## unique names
M <- matrix(df$name2, 10) ## make a matrix, nrow acc. to name 1 groups
f <- \(n) RcppAlgos::comboGeneral(
nm, n, FUN=\(x) c(x, n=sum(colSums(array(M %in% x, dim=dim(M))) == n))) |>
do.call(what=rbind) |> as.data.frame() |> type.convert(as.is=TRUE) |>
(\(.) .[order(-.$n), ])() |> `rownames<-`(NULL)
Gives
head(f(2))
# V1 V2 n
# 1 David Linda 4
# 2 Joe Linda 4
# 3 John Linda 4
# 4 Linda Mary 4
# 5 David Joe 3
# 6 David John 3
head(f(3))
# V1 V2 V3 n
# 1 David Joe Linda 3
# 2 David John Linda 3
# 3 David Linda Mary 3
# 4 Joe John Linda 3
# 5 Joe Linda Mary 3
# 6 John Linda Mary 3
head(f(4))
# V1 V2 V3 V4 n
# 1 Cory Elaine Joe Linda 2
# 2 Cory Elaine Joe Mary 2
# 3 Cory Elaine Joe Victoria 2
# 4 Cory Elaine Linda Mary 2
# 5 Cory Elaine Linda Victoria 2
# 6 Cory Elaine Mary Victoria 2
head(f(5))
# V1 V2 V3 V4 V5 n
# 1 Cory Elaine Joe Linda Mary 2
# 2 Cory Elaine Joe Linda Victoria 2
# 3 Cory Elaine Joe Mary Victoria 2
# 4 Cory Elaine Linda Mary Victoria 2
# 5 Cory Joe Linda Mary Victoria 2
# 6 David Holly John Linda Michael 2
head(f(6))
# V1 V2 V3 V4 V5 V6 n
# 1 Cory Elaine Joe Linda Mary Victoria 2
# 2 Andy David Dwight Holly Jim Joe 1
# 3 Andy David Dwight Holly Jim John 1
# 4 Andy David Dwight Holly Jim June 1
# 5 Andy David Dwight Holly Jim Linda 1
# 6 Andy David Dwight Holly Jim Michael 1
Note: R >= 4.1 used.
Data:
df <- structure(list(name1 = c("John", "John", "John", "John", "John",
"John", "John", "John", "John", "John", "Mary", "Mary", "Mary",
"Mary", "Mary", "Mary", "Mary", "Mary", "Mary", "Mary", "Anne",
"Anne", "Anne", "Anne", "Anne", "Anne", "Anne", "Anne", "Anne",
"Anne", "Joe", "Joe", "Joe", "Joe", "Joe", "Joe", "Joe", "Joe",
"Joe", "Joe", "David", "David", "David", "David", "David", "David",
"David", "David", "David", "David"), name2 = c("Mary", "Thomas",
"Linda", "David", "Joe", "Carl", "Joel", "Victoria", "Elaine",
"Cory", "June", "John", "Linda", "David", "Joe", "Holly", "Michael",
"Dwight", "Jim", "Andy", "Mary", "John", "Linda", "David", "Joe",
"Helen", "Lauren", "Bill", "Saul", "Ben", "Mary", "John", "Linda",
"David", "Robert", "Holly", "Michael", "James", "Renee", "Sally",
"Mary", "John", "Linda", "Paul", "Joe", "Peter", "Clark", "Elaine",
"Cory", "Victoria")), class = "data.frame", row.names = c(NA,
-50L))