Reshape data with complete binaries column
I have a dataframe that looks like this :
df=data.frame("Q1"=c("AA","BB","DD","AA"),
"Q2"=c(NA,"AA","EE","BB"),
"Q3"=c(NA,NA,"FF","EE"),
"Q4"=c(NA,NA,"AA",NA),
"Q5"=c(NA,NA,"BB",NA),
"Q6"=c(NA,NA,NA,NA))
Q1 Q2 Q3 Q4 Q5 Q6
1 AA <NA> <NA> <NA> <NA> NA
2 BB AA <NA> <NA> <NA> NA
3 DD EE FF AA BB NA
4 AA BB EE <NA> <NA> NA
That I would like to convert to a binary data.frame (NA to 0 and 1 if it is matching the column name in the precedent line)
df2=data.frame("AA"=c(1,1,1,1),
"BB"=c(0,1,1,1),
"CC"=c(0,0,0,0),
"DD"=c(0,0,1,0),
"EE"=c(0,0,1,1),
"FF"=c(0,0,1,0))
AA BB CC DD EE FF
1 1 0 0 0 0 0
2 1 1 0 0 0 0
3 1 1 0 1 1 1
4 1 1 0 0 1 0
This might be possible with tidyr but I actually don't know how. Thank you in advance for your help.
Solution 1:
If v
is given you could try this.
v <- c('AA', 'BB', 'CC', 'DD', 'EE', 'FF')
+sapply(v, \(x) apply(replace(df, is.na(df), 0) == x, 1, any))
# AA BB CC DD EE FF
# [1,] 1 0 0 0 0 0
# [2,] 1 1 0 0 0 0
# [3,] 1 1 0 1 1 1
# [4,] 1 1 0 0 1 0
Data:
df <- structure(list(Q1 = c("AA", "BB", "DD", "AA"), Q2 = c(NA, "AA",
"EE", "BB"), Q3 = c(NA, NA, "FF", "EE"), Q4 = c(NA, NA, "AA",
NA), Q5 = c(NA, NA, "BB", NA), Q6 = c(NA_character_, NA_character_,
NA_character_, NA_character_)), class = "data.frame", row.names = c(NA,
-4L))
Solution 2:
In tidyr
and apparented packages, it is easier to work with tidy data; so I'd suggest to pivot the dataset and then work on it:
df %>%
pivot_longer(cols = everything()) %>%
group_by(name = cumsum(name == "Q1")) %>%
count(value) %>%
filter(!is.na(value)) %>%
complete(value = c("AA","BB","CC","DD","EE","FF")) %>%
mutate(n = ifelse(is.na(n), 0, 1)) %>%
pivot_wider(names_from = value, values_from = n) %>%
ungroup() %>%
select(-name)
# A tibble: 4 x 6
AA BB CC DD EE FF
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 0 0 0 0 0
2 1 1 0 0 0 0
3 1 1 0 1 1 1
4 1 1 0 0 1 0
Solution 3:
data.table
version
df=data.frame("Q1"=c("AA","BB","DD","AA"),
"Q2"=c(NA,"AA","EE","BB"),
"Q3"=c(NA,NA,"FF","EE"),
"Q4"=c(NA,NA,"AA",NA),
"Q5"=c(NA,NA,"BB",NA),
"Q6"=c(NA,NA,NA,NA))
v <- c('AA', 'BB', 'CC', 'DD', 'EE', 'FF')
library(data.table)
setDT(df)
df[
,(v) := lapply(v, function(x){
apply(.SD,1,function(xx){
return(max(as.numeric(x %chin% xx), na.rm = T))
})
})
]
Result
> print(df)
Q1 Q2 Q3 Q4 Q5 Q6 AA BB CC DD EE FF
1: AA <NA> <NA> <NA> <NA> NA 1 0 0 0 0 0
2: BB AA <NA> <NA> <NA> NA 1 1 0 0 0 0
3: DD EE FF AA BB NA 1 1 0 1 1 1
4: AA BB EE <NA> <NA> NA 1 1 0 0 1 0