Parse multiple-element XML values into an R data frame

Here is a solution using xml2 together with a quick, if somewhat verbose, tidyverse approach.

It is not always easy to write compact code when importing nested XML data.


library(xml2)
library(dplyr)
library(tidyr)
library(purrr)

# Load the XML document from disk.
te <- xml2::read_xml("~/Desktop/test_so.xml")

# Collect every <Name> node below the document root; the rest of the script
# walks over this node set one element at a time.
cursor <- te %>%
  xml2::xml_find_all(".//Name")

# Build one row per <ElementValue> found under each <Name> node, carrying the
# node's <XRef> attributes along as a nested "Details" list-column.
te2 <- seq_along(cursor) %>%
  purrr::map(function(i) {
    x <- cursor[[i]]

    # First part: the Type attribute (Alternate/Disease/Preferred) of the
    # ElementValue children, plus the text content of the Name node itself.
    values <- xml2::xml_find_all(x, ".//ElementValue")
    temp <- dplyr::tibble(
      Type = xml2::xml_attr(values, "Type"),
      Trait = xml2::xml_text(x)
    )

    # Second part: XRef children are optional. Query them once and reuse the
    # node set for both the existence check and the attribute extraction.
    xrefs <- xml2::xml_find_all(x, ".//XRef")
    if (length(xrefs) > 0) {
      xref_tbl <- dplyr::tibble(
        DB = xml2::xml_attr(xrefs, "DB"),
        ID = xml2::xml_attr(xrefs, "ID"),
        Type_ = xml2::xml_attr(xrefs, "Type")
      )
    } else {
      # No XRef: keep a single all-NA row so the Name is not lost when the
      # Details column is unnested. Typed NAs match the character columns
      # produced by xml_attr() in the other branch.
      xref_tbl <- dplyr::tibble(
        DB = NA_character_,
        ID = NA_character_,
        Type_ = NA_character_
      )
    }

    # Compact the XRef table into a one-element list-column and attach it to
    # every row built for this Name node.
    details <- tidyr::nest(xref_tbl, data = c("DB", "ID", "Type_"))
    dplyr::mutate(temp, Details = details$data)
  }) %>%
  # Row-bind the per-node tibbles into a single data frame.
  dplyr::bind_rows()

# Expand the nested Details list-column so that each XRef record gets its own
# row next to the Type and Trait of the Name node it belongs to.
result <- tidyr::unnest(te2, cols = Details)
# A tibble: 44 × 5
   Type      Trait                                                 DB               ID                                       Type_          
   <chr>     <chr>                                                 <chr>            <chr>                                    <chr>          
 1 Preferred Breast-ovarian cancer, familial 1                     Genetic Alliance Breast-ovarian+cancer%2C+familial+1/7865 NA             
 2 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM             604370                                   MIM            
 3 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM             113705.0001                              Allelic variant
 4 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM             113705.0002                              Allelic variant
 5 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM             113705.0003                              Allelic variant
 6 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM             113705.0004                              Allelic variant
 7 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM             113705.0005                              Allelic variant
 8 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM             113705.0006                              Allelic variant
 9 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM             113705.0007                              Allelic variant
10 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM             113705.0008                              Allelic variant
# … with 34 more rows
>