Reshape data with complete binaries column

I have a dataframe that looks like this :

df=data.frame("Q1"=c("AA","BB","DD","AA"),
              "Q2"=c(NA,"AA","EE","BB"),
              "Q3"=c(NA,NA,"FF","EE"),
              "Q4"=c(NA,NA,"AA",NA),
              "Q5"=c(NA,NA,"BB",NA),
              "Q6"=c(NA,NA,NA,NA))

  Q1   Q2   Q3   Q4   Q5 Q6
1 AA <NA> <NA> <NA> <NA> NA
2 BB   AA <NA> <NA> <NA> NA
3 DD   EE   FF   AA   BB NA
4 AA   BB   EE <NA> <NA> NA

That I would like to convert to a binary data.frame (NA to 0 and 1 if it is matching the column name in the precedent line)

df2=data.frame("AA"=c(1,1,1,1),
               "BB"=c(0,1,1,1),
               "CC"=c(0,0,0,0),
               "DD"=c(0,0,1,0),
               "EE"=c(0,0,1,1),
               "FF"=c(0,0,1,0))

  AA BB CC DD EE FF
1  1  0  0  0  0  0
2  1  1  0  0  0  0
3  1  1  0  1  1  1
4  1  1  0  0  1  0

This might be possible with tidyr but I actually don't know how. Thank you in advance for your help.


Solution 1:

If v is given you could try this.

v <- c('AA', 'BB', 'CC', 'DD', 'EE', 'FF')

+sapply(v, \(x) apply(replace(df, is.na(df), 0) == x, 1, any))
#      AA BB CC DD EE FF
# [1,]  1  0  0  0  0  0
# [2,]  1  1  0  0  0  0
# [3,]  1  1  0  1  1  1
# [4,]  1  1  0  0  1  0

Data:

df <- structure(list(Q1 = c("AA", "BB", "DD", "AA"), Q2 = c(NA, "AA", 
"EE", "BB"), Q3 = c(NA, NA, "FF", "EE"), Q4 = c(NA, NA, "AA", 
NA), Q5 = c(NA, NA, "BB", NA), Q6 = c(NA_character_, NA_character_, 
NA_character_, NA_character_)), class = "data.frame", row.names = c(NA, 
-4L))

Solution 2:

In tidyr and apparented packages, it is easier to work with tidy data; so I'd suggest to pivot the dataset and then work on it:

df %>% 
  pivot_longer(cols = everything()) %>% 
  group_by(name = cumsum(name == "Q1")) %>% 
  count(value) %>% 
  filter(!is.na(value)) %>% 
  complete(value = c("AA","BB","CC","DD","EE","FF")) %>% 
  mutate(n = ifelse(is.na(n), 0, 1)) %>% 
  pivot_wider(names_from = value, values_from = n) %>% 
  ungroup() %>% 
  select(-name)

# A tibble: 4 x 6
     AA    BB    CC    DD    EE    FF
  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1     1     0     0     0     0     0
2     1     1     0     0     0     0
3     1     1     0     1     1     1
4     1     1     0     0     1     0

Solution 3:

data.table version

df=data.frame("Q1"=c("AA","BB","DD","AA"),
          "Q2"=c(NA,"AA","EE","BB"),
          "Q3"=c(NA,NA,"FF","EE"),
          "Q4"=c(NA,NA,"AA",NA),
          "Q5"=c(NA,NA,"BB",NA),
          "Q6"=c(NA,NA,NA,NA))
v <- c('AA', 'BB', 'CC', 'DD', 'EE', 'FF')


library(data.table)

setDT(df)
df[
  ,(v) := lapply(v, function(x){
    apply(.SD,1,function(xx){
      return(max(as.numeric(x %chin% xx), na.rm = T))
    })
  })
]

Result

>  print(df)
   Q1   Q2   Q3   Q4   Q5 Q6 AA BB CC DD EE FF
1: AA <NA> <NA> <NA> <NA> NA  1  0  0  0  0  0
2: BB   AA <NA> <NA> <NA> NA  1  1  0  0  0  0
3: DD   EE   FF   AA   BB NA  1  1  0  1  1  1
4: AA   BB   EE <NA> <NA> NA  1  1  0  0  1  0