Merge two dataframe with 2 columns (comma separated content) in R
I have a df such as :
df1
Nb Groups_subgroups
A 1_Cluster17972,1_Cluster2653,1_Cluster3900,2_Cluster12159,1_Cluster1798,1_Cluster17493
B 7_Cluster13022,1_Cluster4976
C 2_Cluster27505
D 7_Cluster25561,23_Cluster25561
E 1_Cluster23500
And another df2 such as :
Groups Subgroups
Cluster17972 1
Cluster17972 2
Cluster2653 1
Cluster3900 1
Cluster12159 2
Cluster12159 3
Cluster1798 1
Cluster17493 1
Cluster00001 1
Cluster00001 2
Cluster13022 7
Cluster4976 1
Cluster27505 2
Cluster25561 7
Cluster25561 23
Cluster23500 1
And I would like to merge df1$Groups_subgroups
with df2$Groups
and df2$Subgroups
columns and get a result such as :
Groups Subgroups Nb
Cluster17972 1 A
Cluster17972 2 NA
Cluster2653 1 A
Cluster3900 1 A
Cluster12159 2 A
Cluster12159 3 NA
Cluster1798 1 A
Cluster17493 1 A
Cluster00001 1 NA
Cluster00001 2 NA
Cluster13022 7 B
Cluster4976 1 B
Cluster27505 2 C
Cluster25561 7 D
Cluster25561 23 D
Cluster23500 1 E
Does someone have an idea ? Here are the two df in dput format :
df1
structure(list(Nb = c("A", "B", "C", "D", "E"), Groups_subgroups = c("1_Cluster17972,1_Cluster2653,1_Cluster3900,2_Cluster12159,1_Cluster1798,1_Cluster17493",
"7_Cluster13022,1_Cluster4976", "2_Cluster27505", "7_Cluster25561,23_Cluster25561",
"1_Cluster23500")), class = "data.frame", row.names = c(NA, -5L
))
df2
structure(list(Groups = c("Cluster17972", "Cluster17972", "Cluster2653",
"Cluster3900", "Cluster12159", "Cluster12159", "Cluster1798",
"Cluster17493", "Cluster00001", "Cluster00001", "Cluster13022",
"Cluster4976", "Cluster27505", "Cluster25561", "Cluster25561",
"Cluster23500"), Subgroups = c(1, 2, 1, 1, 2, 3, 1, 1, 1, 2,
7, 1, 2, 7, 23, 1)), class = "data.frame", row.names = c(NA,
-16L))
Solution 1:
dplyr
library(dplyr)
library(tidyr) # unnest, separate
df1 %>%
mutate(Groups_subgroups = strsplit(Groups_subgroups, "[ ,]+")) %>%
unnest(Groups_subgroups) %>%
separate(Groups_subgroups, into = c("Subgroups", "Groups")) %>%
mutate(Subgroups = as.integer(Subgroups)) %>%
left_join(df2, ., by = c("Groups", "Subgroups"))
# Groups Subgroups Nb
# 1 Cluster17972 1 A
# 2 Cluster17972 2 <NA>
# 3 Cluster2653 1 A
# 4 Cluster3900 1 A
# 5 Cluster12159 2 A
# 6 Cluster12159 3 <NA>
# 7 Cluster1798 1 A
# 8 Cluster17493 1 A
# 9 Cluster00001 1 <NA>
# 10 Cluster00001 2 <NA>
# 11 Cluster13022 7 B
# 12 Cluster4976 1 B
# 13 Cluster27505 2 C
# 14 Cluster25561 7 D
# 15 Cluster25561 23 D
# 16 Cluster23500 1 E
base R
subgr <- strsplit(df1$Groups_subgroups, "[ ,]+")
subgr
# [[1]]
# [1] "1_Cluster17972" "1_Cluster2653" "1_Cluster3900" "2_Cluster12159" "1_Cluster1798" "1_Cluster17493"
# [[2]]
# [1] "7_Cluster13022" "1_Cluster4976"
# [[3]]
# [1] "2_Cluster27505"
# [[4]]
# [1] "7_Cluster25561" "23_Cluster25561"
# [[5]]
# [1] "1_Cluster23500"
groups <- data.frame(Nb = rep(df1$Nb, lengths(subgr)), GSG = unlist(subgr))
groups <- cbind(groups, strcapture("^([^_]+)_(.*)", groups$GSG, list(Subgroups = 0L, Groups = "")))
groups <- groups[,-2]
groups
# Nb Subgroups Groups
# 1 A 1 Cluster17972
# 2 A 1 Cluster2653
# 3 A 1 Cluster3900
# 4 A 2 Cluster12159
# 5 A 1 Cluster1798
# 6 A 1 Cluster17493
# 7 B 7 Cluster13022
# 8 B 1 Cluster4976
# 9 C 2 Cluster27505
# 10 D 7 Cluster25561
# 11 D 23 Cluster25561
# 12 E 1 Cluster23500
With this, we just merge
/join the two objects together:
merge(df2, groups, by = c("Groups", "Subgroups"), all.x = TRUE)
# Groups Subgroups Nb
# 1 Cluster00001 1 <NA>
# 2 Cluster00001 2 <NA>
# 3 Cluster12159 2 A
# 4 Cluster12159 3 <NA>
# 5 Cluster13022 7 B
# 6 Cluster17493 1 A
# 7 Cluster17972 1 A
# 8 Cluster17972 2 <NA>
# 9 Cluster1798 1 A
# 10 Cluster23500 1 E
# 11 Cluster25561 7 D
# 12 Cluster25561 23 D
# 13 Cluster2653 1 A
# 14 Cluster27505 2 C
# 15 Cluster3900 1 A
# 16 Cluster4976 1 B
Solution 2:
First we could prepare df1
with separate_rows
and separate
and
then join with Groups
AND Subgroups
:
library(dplyr)
Library(tidyr)
df1 <- df1 %>%
separate_rows(Groups_subgroups, sep = ',') %>%
separate(Groups_subgroups, c("Subgroups", "Groups")) %>%
type.convert(as.is = TRUE)
df2 %>%
left_join(df1, by=c("Groups", "Subgroups"))
Groups Subgroups Nb
1 Cluster17972 1 A
2 Cluster17972 2 <NA>
3 Cluster2653 1 A
4 Cluster3900 1 A
5 Cluster12159 2 A
6 Cluster12159 3 <NA>
7 Cluster1798 1 A
8 Cluster17493 1 A
9 Cluster00001 1 <NA>
10 Cluster00001 2 <NA>
11 Cluster13022 7 B
12 Cluster4976 1 B
13 Cluster27505 2 C
14 Cluster25561 7 D
15 Cluster25561 23 D
16 Cluster23500 1 E