Extract top positive and negative values from dataframe and fill them into a formatted text using R

I'm trying to extract material infos whose price increase and decrease the most top 3 base on pct_change column.

Data:

df <- structure(list(material = c("Copper", "Aluminum", "Iron", "Zinc", 
"Nickel", "Silver", "Gold", "Tin"), price = c(17125, 8312, 2228.5, 
2934, 4315, 8178, 4411, 680), pct_change = c(0.025449102, 0, 
-0.024939838, 0.062470043, -0.043873255, -0.004625122, 0.045031392, 
-0.037508846)), class = "data.frame", row.names = c(NA, -8L))

My expected result will be a paragraph of text as follows:

text <- 'The top 3 commodities that price rise most are: Zinc (6.25%), Gold (4.5%), and Copper (2.54%), 
the top 3 commodities that fall most are: Nickel (-4.39%), Tin (-3.75%) and Iron (-2.49%).'

My trial code works, but not concise, does someone could share other more efficient solutions? Thanks.

top3 <- df %>% 
  arrange(desc(pct_change)) %>% 
  mutate(pct_change=scales::percent(pct_change)) %>% 
  slice_head(n=3)

tail3 <- df %>% 
  arrange(pct_change) %>% 
  mutate(pct_change=scales::percent(pct_change)) %>% 
  slice_head(n=3)

com_name_up1 <- top3$material[1]
com_pct_up1 <- top3$pct_change[1]

com_name_up2 <- top3$material[2]
com_pct_up2 <- top3$pct_change[2]

com_name_up3 <- top3$material[3]
com_pct_up3 <- top3$pct_change[3]

com_name_down1 <- tail3$material[1]
com_pct_down1 <- tail3$pct_change[1]

com_name_down2 <- tail3$material[2]
com_pct_down2 <- tail3$pct_change[2]

com_name_down3 <- tail3$material[3]
com_pct_down3 <- tail3$pct_change[3]

text <- glue('The top 3 commodities that price rose most are: {com_name_up1} ({com_pct_up1}), 
{com_name_up2} ({com_pct_up2}), and {com_name_up3} ({com_pct_up3}), 
the top 3 commodities that fell most are: {com_name_down1} ({com_pct_down1}), 
             {com_name_down2} ({com_pct_down2}) and {com_name_down3} ({com_pct_down3}).')

Updated text templates to consider cases pct_change with all negative or positive, or less than 3 positive and negative values:

if pct_change values are all positive:

'All {n} commodities price rose, the top 3 rose most are: {top3[[1L]]}, {top3[[2L]]}, and {top3[[3L]]}.'
if pct_change values are all negatives:

'All {n} commodities price fell, the top 3 fell most are: {top3[[1L]]}, {top3[[2L]]}, and {top3[[3L]]}'
if positive, zero and negative pct_change values both exists, I will set n1_1, n2_1, and n3_1 for counting numbers, n1_2, n2_2, and n3_3 for getting top n, whose maximum values are 3:

'We have {n1_1} commodities that price rose, the top {n1_2} rose most are: {top3[[1L]]}, {top3[[2L]]}, and {top3[[3L]]}; \ {n2_1} commodities that price keep the same, including: {same3[[1L]]}, {same3[[2L]]}, and {same3[[3L]]}; \ {n3_1} commodities that price fell, the top {n3_2} fell most are: {bot3[[1L]]}, {bot3[[2L]]}, and {bot3[[3L]]}.'

Final code, contributed by @ekoam:

to_string <- function(x, sep = ", ") {
  if (length(x) < 2L)
    return(x)
  out <- character(length(x) + length(x) - 1L)
  out[seq.int(1L, by = 2L, length.out = length(x))] <- x
  out[seq.int(2L, by = 2L, length.out = length(x) - 1L)] <- sep
  out[[length(out) - 1L]] <- " and "
  paste0(out, collapse = "")
}

text_summary <- function(df) {
  switch(
    df$sign[[1L]] + 2L, 
    \(x) dplyr::slice_min(x, pct_change, n = 3L), 
    \(x) dplyr::slice_sample(x, n = 3L), 
    \(x) dplyr::slice_max(x, pct_change, n = 3L)
  )(df[, c("material", "pct_change")]) |> 
    dplyr::mutate(pct_change = scales::percent(pct_change)) |> 
    glue::glue_data("{material} ({pct_change})")
}

plural <- function(w, n) {
  data <- c("commodity" = "commodities", "is" = "are")
  if (n == 1L)
    return(w)
  data[[w]]
}

content <- df |> 
  dplyr::group_by(sign = sign(pct_change)) |> 
  dplyr::summarize(
    n = dplyr::n(), 
    text = text_summary(dplyr::cur_data_all()), 
    .groups = "keep"
  )

s1 <- all(content$sign < 0L)
s2 <- all(content$sign > 0L)

tmpl <- 
  if (s1) {
    c("commodities price fell, the top {length(text)} fell most {plural('is', length(text))}: {to_string(text)}", "", "")
  } else if (s2) {
    c("", "", "commodities price rose, the top {length(text)} rose most {plural('is', length(text))}: {to_string(text)}")
  } else {
    c("{n[[1L]]} {plural('commodity',n[[1L]])} that price fell, the commodities with larger price declines are: {plural('is', length(text))}: {to_string(text)}",
      "{n[[1L]]} {plural('commodity',n[[1L]])} that price kept the same, including: {to_string(text)}",
      "{n[[1L]]} {plural('commodity',n[[1L]])} that price rose, the commodities with larger increases are: {plural('is', length(text))}: {to_string(text)}")
  }

prefix <- c("We have ", "All the ")[[(s1 || s2) + 1L]]

content <- content |> 
  dplyr::mutate(tmpl = tmpl[sign + 2L]) |> 
  dplyr::summarize(text = glue::glue(tmpl[[1L]])) |> 
  dplyr::arrange(-sign) |> 
  dplyr::summarize(text = paste0(prefix, to_string(text, "; "), "."))

content$text

Out:

"We have 3 commodities that price rose, the commodities with larger increases are: are: Zinc (6.2%), Gold (4.5%) and Copper (2.5%); 1 commodity that price kept the same, including: Aluminum (0%) and 4 commodities that price fell, the commodities with larger price declines are: are: Nickel (-4.39%), Tin (-3.75%) and Iron (-2.49%)."

Solution 1:

Two general suggestions:

glue has a very flexible syntax that allows you to pass any valid R expression into the "{...}". Utilizing this feature will help shorten your code.
You should abstract out the parts with similar structures and summarize them into a function.

Here is the code

report3 <- function(df, f) {
  df |> 
    f(pct_change, n = 3L) |> 
    dplyr::mutate(pct_change = scales::percent(pct_change)) |> 
    glue::glue_data("{material} ({pct_change})")
}
top3 <- report3(df, dplyr::slice_max)
bot3 <- report3(df, dplyr::slice_min)
text <- glue::glue('The top 3 commodities that price rose most are: \\
                   {top3[[1L]]}, {top3[[2L]]}, and {top3[[3L]]}; \\
                   the top 3 commodities that fell most are: \\
                   {bot3[[1L]]}, {bot3[[2L]]} and {bot3[[3L]]}.')

Output

> text
The top 3 commodities that price rose most are: Zinc (6.2%), Gold (4.5%), and Copper (2.5%); the top 3 commodities that fell most are: Nickel (-4.39%), Tin (-3.75%) and Iron (-2.49%).

The code below should be sufficient to cover all the cases you provided

to_string <- function(x, sep = ", ") {
  if (length(x) < 2L)
    return(x)
  out <- character(length(x) + length(x) - 1L)
  out[seq.int(1L, by = 2L, length.out = length(x))] <- x
  out[seq.int(2L, by = 2L, length.out = length(x) - 1L)] <- sep
  out[[length(out) - 1L]] <- " and "
  paste0(out, collapse = "")
}

text_summary <- function(df) {
  switch(
    df$sign[[1L]] + 2L, 
    \(x) dplyr::slice_min(x, pct_change, n = 3L), 
    \(x) dplyr::slice_sample(x, n = 3L), 
    \(x) dplyr::slice_max(x, pct_change, n = 3L)
  )(df[, c("material", "pct_change")]) |> 
    dplyr::mutate(pct_change = scales::percent(pct_change)) |> 
    glue::glue_data("{material} ({pct_change})")
}

plural <- function(w, n) {
  data <- c("commodity" = "commodities", "is" = "are")
  if (n == 1L)
    return(w)
  data[[w]]
}

content <- df |> 
  dplyr::group_by(sign = sign(pct_change)) |> 
  dplyr::summarize(
    n = dplyr::n(), 
    text = text_summary(dplyr::cur_data_all()), 
    .groups = "keep"
  )

s1 <- all(content$sign < 0L)
s2 <- all(content$sign > 0L)

tmpl <- 
  if (s1) {
    c("commodities price fell, the top {length(text)} fell most {plural('is', length(text))}: {to_string(text)}", "", "")
  } else if (s2) {
    c("", "", "commodities price rose, the top {length(text)} rose most {plural('is', length(text))}: {to_string(text)}")
  } else {
    c("{n[[1L]]} {plural('commodity',n[[1L]])} that price fell, the top {length(text)} fell most {plural('is', length(text))}: {to_string(text)}",
      "{n[[1L]]} {plural('commodity',n[[1L]])} that price kept the same, including: {to_string(text)}",
      "{n[[1L]]} {plural('commodity',n[[1L]])} that price rose, the top {length(text)} rose most {plural('is', length(text))}: {to_string(text)}")
  }

prefix <- c("We have ", "All the ")[[(s1 || s2) + 1L]]

content <- content |> 
  dplyr::mutate(tmpl = tmpl[sign + 2L]) |> 
  dplyr::summarize(text = glue::glue(tmpl[[1L]])) |> 
  dplyr::arrange(-sign) |> 
  dplyr::summarize(text = paste0(prefix, to_string(text, "; "), "."))

content$text

Solution 2:

To be honest I'm not sure if it's actually much shorter but you could look to glue the material and pct_change within the table first.

I've then grouped it up and collapsed the strings

df %>% 
  arrange(desc(pct_change)) %>% 
  mutate(
    t1 = sprintf('%s (%.2f%%)', material, pct_change*100),
    rank1 = case_when(
      row_number() <= 3 ~ 'Top',
      row_number() > n() -3 ~ 'Bot'
    )
  ) %>%
  group_by(rank1) %>%
  summarise(
    t2 = paste(t1, collapse = ', ')
  )

  rank1 t2                                          
  <chr> <chr>                                       
1 Bot   Iron (-2.49%), Tin (-3.75%), Nickel (-4.39%)
2 Top   Zinc (6.25%), Gold (4.50%), Copper (2.54%)  
3 NA    Aluminum (0.72%), Silver (-0.46%)

Solution 3:

Another possible solution (instead of stringr::str_c, you could use the more convenient stringr::str_glue, as @ekoam well suggests):

library(tidyverse)

df <- structure(list(material = c("Copper", "Aluminum", "Iron", "Zinc", 
"Nickel", "Silver", "Gold", "Tin"), price = c(17125, 8312, 2228.5, 
2934, 4315, 8178, 4411, 680), pct_change = c(0.025449102, 0.007166746, 
-0.024939838, 0.062470043, -0.043873255, -0.004625122, 0.045031392, 
-0.037508846)), class = "data.frame", row.names = c(NA, -8L))

top3 <- slice_max(df, pct_change, n = 3)
bottom3 <- slice_min(df, pct_change, n = 3)

str_c("The top 3 commodities that price rise most are: ", 
top3$material[1]," (", round(100*top3$pct_change[1], 2),"%), ",
top3$material[2]," (", round(100*top3$pct_change[2],2),"%), and ", 
top3$material[3]," (", round(100*top3$pct_change[3]),"%), the top 3 
commodities that fall most are: ", bottom3$material[1]," (", 
round(100*bottom3$pct_change[1], 2),"%), ", bottom3$material[2]," (", 
round(100*bottom3$pct_change[2],2),"%), and ", bottom3$material[3]," 
(", round(100*bottom3$pct_change[3]),"%).")

#> [1] "The top 3 commodities that price rise most are: Zinc (6.25%), Gold (4.5%), and Copper (3%), the top 3 commodities that fall most are: Nickel (-4.39%), Tin (-3.75%), and Iron (-2%)."

Extract top positive and negative values from dataframe and fill them into a formatted text using R

Solution 1:

Solution 2:

Solution 3:

Related

Recent Posts