Merge two dataframe with 2 columns (comma separated content) in R

I have a df such as :

df1

Nb Groups_subgroups 
A  1_Cluster17972,1_Cluster2653,1_Cluster3900,2_Cluster12159,1_Cluster1798,1_Cluster17493
B  7_Cluster13022,1_Cluster4976
C  2_Cluster27505
D  7_Cluster25561,23_Cluster25561
E  1_Cluster23500

And another df2 such as :

Groups       Subgroups 
Cluster17972 1
Cluster17972 2 
Cluster2653  1
Cluster3900  1
Cluster12159 2
Cluster12159 3
Cluster1798  1
Cluster17493 1
Cluster00001 1
Cluster00001 2
Cluster13022 7
Cluster4976  1
Cluster27505 2
Cluster25561 7
Cluster25561 23
Cluster23500 1

And I would like to merge df1$Groups_subgroups with df2$Groups and df2$Subgroups columns and get a result such as :

Groups       Subgroups Nb
Cluster17972 1         A
Cluster17972 2         NA
Cluster2653  1         A
Cluster3900  1         A
Cluster12159 2         A
Cluster12159 3         NA
Cluster1798  1         A
Cluster17493 1         A
Cluster00001 1         NA
Cluster00001 2         NA
Cluster13022 7         B
Cluster4976  1         B
Cluster27505 2         C
Cluster25561 7         D
Cluster25561 23        D
Cluster23500 1         E

Does someone have an idea ? Here are the two df in dput format :

df1

structure(list(Nb = c("A", "B", "C", "D", "E"), Groups_subgroups = c("1_Cluster17972,1_Cluster2653,1_Cluster3900,2_Cluster12159,1_Cluster1798,1_Cluster17493", 
"7_Cluster13022,1_Cluster4976", "2_Cluster27505", "7_Cluster25561,23_Cluster25561", 
"1_Cluster23500")), class = "data.frame", row.names = c(NA, -5L
))

df2

structure(list(Groups = c("Cluster17972", "Cluster17972", "Cluster2653", 
"Cluster3900", "Cluster12159", "Cluster12159", "Cluster1798", 
"Cluster17493", "Cluster00001", "Cluster00001", "Cluster13022", 
"Cluster4976", "Cluster27505", "Cluster25561", "Cluster25561", 
"Cluster23500"), Subgroups = c(1, 2, 1, 1, 2, 3, 1, 1, 1, 2, 
7, 1, 2, 7, 23, 1)), class = "data.frame", row.names = c(NA, 
-16L))

Solution 1:

dplyr

library(dplyr)
library(tidyr) # unnest, separate
df1 %>%
  mutate(Groups_subgroups = strsplit(Groups_subgroups, "[ ,]+")) %>%
  unnest(Groups_subgroups) %>%
  separate(Groups_subgroups, into = c("Subgroups", "Groups")) %>%
  mutate(Subgroups = as.integer(Subgroups)) %>%
  left_join(df2, ., by = c("Groups", "Subgroups"))
#          Groups Subgroups   Nb
# 1  Cluster17972         1    A
# 2  Cluster17972         2 <NA>
# 3   Cluster2653         1    A
# 4   Cluster3900         1    A
# 5  Cluster12159         2    A
# 6  Cluster12159         3 <NA>
# 7   Cluster1798         1    A
# 8  Cluster17493         1    A
# 9  Cluster00001         1 <NA>
# 10 Cluster00001         2 <NA>
# 11 Cluster13022         7    B
# 12  Cluster4976         1    B
# 13 Cluster27505         2    C
# 14 Cluster25561         7    D
# 15 Cluster25561        23    D
# 16 Cluster23500         1    E

base R

subgr <- strsplit(df1$Groups_subgroups, "[ ,]+")
subgr
# [[1]]
# [1] "1_Cluster17972" "1_Cluster2653"  "1_Cluster3900"  "2_Cluster12159" "1_Cluster1798"  "1_Cluster17493"
# [[2]]
# [1] "7_Cluster13022" "1_Cluster4976" 
# [[3]]
# [1] "2_Cluster27505"
# [[4]]
# [1] "7_Cluster25561"  "23_Cluster25561"
# [[5]]
# [1] "1_Cluster23500"
groups <- data.frame(Nb = rep(df1$Nb, lengths(subgr)), GSG = unlist(subgr))
groups <- cbind(groups, strcapture("^([^_]+)_(.*)", groups$GSG, list(Subgroups = 0L, Groups = "")))
groups <- groups[,-2]
groups
#    Nb Subgroups       Groups
# 1   A         1 Cluster17972
# 2   A         1  Cluster2653
# 3   A         1  Cluster3900
# 4   A         2 Cluster12159
# 5   A         1  Cluster1798
# 6   A         1 Cluster17493
# 7   B         7 Cluster13022
# 8   B         1  Cluster4976
# 9   C         2 Cluster27505
# 10  D         7 Cluster25561
# 11  D        23 Cluster25561
# 12  E         1 Cluster23500

With this, we just merge/join the two objects together:

merge(df2, groups, by = c("Groups", "Subgroups"), all.x = TRUE)
#          Groups Subgroups   Nb
# 1  Cluster00001         1 <NA>
# 2  Cluster00001         2 <NA>
# 3  Cluster12159         2    A
# 4  Cluster12159         3 <NA>
# 5  Cluster13022         7    B
# 6  Cluster17493         1    A
# 7  Cluster17972         1    A
# 8  Cluster17972         2 <NA>
# 9   Cluster1798         1    A
# 10 Cluster23500         1    E
# 11 Cluster25561         7    D
# 12 Cluster25561        23    D
# 13  Cluster2653         1    A
# 14 Cluster27505         2    C
# 15  Cluster3900         1    A
# 16  Cluster4976         1    B

Solution 2:

First we could prepare df1 with separate_rows and separate and then join with Groups AND Subgroups:

library(dplyr)
Library(tidyr)

df1 <- df1 %>% 
  separate_rows(Groups_subgroups, sep = ',') %>% 
  separate(Groups_subgroups, c("Subgroups", "Groups")) %>% 
  type.convert(as.is = TRUE) 

  
df2 %>% 
  left_join(df1, by=c("Groups", "Subgroups"))
         Groups Subgroups   Nb
1  Cluster17972         1    A
2  Cluster17972         2 <NA>
3   Cluster2653         1    A
4   Cluster3900         1    A
5  Cluster12159         2    A
6  Cluster12159         3 <NA>
7   Cluster1798         1    A
8  Cluster17493         1    A
9  Cluster00001         1 <NA>
10 Cluster00001         2 <NA>
11 Cluster13022         7    B
12  Cluster4976         1    B
13 Cluster27505         2    C
14 Cluster25561         7    D
15 Cluster25561        23    D
16 Cluster23500         1    E