Add a column with distribution plot of every variable in a summary table

Solution 1:

Even though I am aware that this might be much easier to do with ggplot, I am always eager to see whether I can achieve a similar result with base R plotting tools. I will make use of the iris data in this example.

We first need to identify which columns of our data.frame are numeric.

# returns logical which is TRUE if column p is numeric
numeric_cols <- c(rep(NA, ncol(iris)))
for(p in seq_len(ncol(iris))) {
  numeric_cols[p] <- inherits(iris[, p], 'numeric')
}

Then, we may select some arbitrary colours for densities. Here, I picked three colours which correspond to the number of levels of iris$Species.

my_cols <- c('blue4', 'darkorange', '#00b0a4')
adj_col <- \(x) adjustcolor(x, alpha.f = 0.2)
my_transp_cols <- c(
  adj_col('blue4'), adj_col('darkorange'), adj_col('#00b0a4')
)

Now we need to plot the densities. The function that is given below (i.e. plot_densities) has an option to either provide marginal densities or densities conditional on some factor variable. If you would like to obtain the densities conditional on some factor variable, simply set include_factor to TRUE and pass the factor variable of interest to the factor argument.

plot_densities <- \(DF, columns, include_factor = FALSE, factor) {
  name_vars <- names(DF)
  DF <- DF[complete.cases(DF[name_vars]), ]
  ## setting up plotting device
  layout(matrix(seq_len(4L), ncol = 4L))
  ## only use the TRUEs indicating numeric columns
  n_cols <- length(columns[columns])
  ## if densities are to be shown per factor level
  if (include_factor) {
    par(mar = c(5, 4, 4, 8) + 0.1, xpd = TRUE)
    lvls <- unique(levels(DF[[factor]]))
    for (i in seq_len(n_cols)) {
      ## preallocation
      max_y <- max_x <- min_x <- rep(NA, length(unique(levels(DF[[factor]]))))
      means <- SDs <- rep(NA, length(unique(levels(DF[[factor]]))))
      no_of_levels <- length(lvls)
      for (j in seq_len(no_of_levels)) {
        ## only proceed with this loop if column i is numeric else next
        if (columns[i]) {
          ## subset consisting values of column i for factor level j
          sub <- subset(DF, DF[[factor]] %in% lvls[j])[, i]
          ## make sure that the densities of column i per factor level j
          ## are depicted in the same panel
          if (j == 1) {
            ## limits for the x and y axes per panel for column i
            for (k in seq_len(no_of_levels)) {
              sub_k <- subset(DF, DF[[factor]] %in% lvls[k])[, i]
              x <- density(sub_k)$x
              y <- density(sub_k)$y
              min_x[k] <- min(x)
              max_x[k] <- max(x)
              max_y[k] <- max(y)
            }
            ## mean and SD for column i per factor level j
            r <- \(x) format(round(x, 1L), nsmall = 1L)
            for (kk in seq_len(no_of_levels)) {
              sub_kk <- subset(DF, DF[[factor]] %in% lvls[kk])[, i]
              means[kk] <- r(mean(sub_kk, na.rm = TRUE))
              SDs[kk] <- r(sd(sub_kk, na.rm = TRUE))
            }
            x_lim <- c(min(min_x), max(max_x))
            y_lim <- c(0L, max(max_y))
            plot(density(sub), main = '',
                 las = 1, col = my_cols[j], xlab = '',
                 xlim = x_lim, ylim = y_lim, bty = 'n')
            title(main = names(DF)[i], xpd = TRUE, adj = 1)
            polygon(density(sub), density = -1L, col = my_transp_cols[j])
          } else {
            lines(density(sub), col = my_cols[j])
            polygon(density(sub), density = -1L, col = my_transp_cols[j])
          }
        } else next
      }
      ## add legend to the plot
      legend('topright', paste0(lvls, ': ', means, ' (', SDs, ')'),
             fill = my_transp_cols, bty = 'n',
             inset = c(-0.5, 0.1))
    }
  } else {
    ## if densities are NOT to be shown per factor level
    for (i in seq_len(n_cols)) {
      par(mar = c(5, 4, 4, 8) + 0.1, xpd = TRUE)
      ## only proceed with this loop if column i is numeric else next
      if (columns[i]) {
        ## mean and SD for column i
        r <- \(x) format(round(x, 1L), nsmall = 1L)
        means <- SDs <- rep(NA, n_cols)
        for(j in seq_len(n_cols)) {
          means[j] <- r(mean(DF[, j], na.rm = TRUE))
          SDs[j] <- r(sd(DF[, j], na.rm = TRUE))
        }
        plot(density(DF[, i]),
             las = 1, main = names(DF)[i], col = my_cols[1L], xlab = '',
             bty = 'n')
        polygon(density(DF[, i]), density = -1L, col = my_transp_cols[1L])
        ## add legend to the plot
        legend('topright', paste0(names(DF)[i], ': ', means[i], ' (', SDs[i], ')'),
               fill = my_transp_cols[1L], bty = 'n',
               inset = c(-0.5, 0.1))
      } else next
    }
  }
}

We can save the output as .pdf file. If you would change the layout of the plotting device, than you would also have to play a little bit with the width and height to make it fit your specific situation.

# marginal densities
pdf(file = 'my_directory/my_plot.pdf', # change my_directory
    width = 13, height = 4) 
plot_densities(DF = iris, columns = numeric_cols)
dev.off()

# conditional densities
pdf(file = 'my_directory/my_plot2.pdf', # change my_directory
    width = 13, height = 4) 
plot_densities(DF = iris, columns = numeric_cols,
               include_factor = TRUE, factor = 'Species')
dev.off()

I usually make .pdf files of my plots and then use this online converting tool to convert them to .png files.

I have shown the mean (SD) in the figure legends, but you may choose to show any statistic you like. Just change mean(sub) and sd(sub) in the function to the statistics of your interest.

Output

Marginal densities enter image description here

Conditional densities enter image description here

Note: use function(x) instead of \(x) if you use a version of R <4.1.0

Add a column with distribution plot of every variable in a summary table

Solution 1:

Related

Recent Posts