Parse multiple-element XML values into an R data frame
Here is one solution, using xml2 together with a quick (if verbose) tidyverse approach.
It is not always easy to keep the code compact when importing nested XML data.
library(xml2)
library(dplyr)
library(tidyr)
library(purrr)
# Convert a single <Name> node into a tibble.
#
# Columns:
#   Type    - the `Type` attribute of each <ElementValue> found under the node
#   Trait   - the text content of the node
#   Details - list-column holding one tibble (DB, ID, Type_) built from the
#             attributes of every <XRef> found under the node; a single
#             all-NA row is used when the node has no <XRef>
parse_name_node <- function(node) {
  # Query the <XRef> nodes once and reuse the result for the existence
  # check and for all three attribute extractions.
  xrefs <- xml2::xml_find_all(node, ".//XRef")

  if (length(xrefs) > 0) {
    details <- dplyr::tibble(
      DB = xml2::xml_attr(xrefs, "DB"),
      ID = xml2::xml_attr(xrefs, "ID"),
      Type_ = xml2::xml_attr(xrefs, "Type")
    )
  } else {
    # Typed character NAs so this placeholder has the same column types as
    # the populated branch (xml_attr() returns character vectors).
    details <- dplyr::tibble(
      DB = NA_character_,
      ID = NA_character_,
      Type_ = NA_character_
    )
  }

  # `Trait` and `Details` are length 1 and are recycled to the number of
  # <ElementValue> matches.
  dplyr::tibble(
    Type = xml2::xml_attr(xml2::xml_find_all(node, ".//ElementValue"), "Type"),
    Trait = xml2::xml_text(node),
    Details = list(details)
  )
}

# read the xml file
te <- xml2::read_xml("~/Desktop/test_so.xml")

# every <Name> node in the document, to move from node to node
cursor <- xml2::xml_find_all(te, ".//Name")

# one tibble per <Name> node, row-bound into a single tibble that still
# carries the nested `Details` list-column
te2 <- purrr::map(
  seq_along(cursor),
  function(i) parse_name_node(cursor[[i]])
) %>%
  dplyr::bind_rows()

# expand the nested details: one output row per (ElementValue, XRef) pair
result <- te2 %>%
  tidyr::unnest(Details)
# A tibble: 44 × 5
Type Trait DB ID Type_
<chr> <chr> <chr> <chr> <chr>
1 Preferred Breast-ovarian cancer, familial 1 Genetic Alliance Breast-ovarian+cancer%2C+familial+1/7865 NA
2 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM 604370 MIM
3 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM 113705.0001 Allelic variant
4 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM 113705.0002 Allelic variant
5 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM 113705.0003 Allelic variant
6 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM 113705.0004 Allelic variant
7 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM 113705.0005 Allelic variant
8 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM 113705.0006 Allelic variant
9 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM 113705.0007 Allelic variant
10 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM 113705.0008 Allelic variant
# … with 34 more rows
>