Add a column with distribution plot of every variable in a summary table
Solution 1:
Even though I am aware that this might be much easier to do with `ggplot2`, I am always eager to see whether I can achieve a similar result with base R plotting tools. I will make use of the `iris` data in this example.
We first need to identify which columns of our `data.frame` are numeric.
# Logical vector with one entry per column of iris: TRUE if the column is
# numeric. is.numeric() is TRUE for both double and integer columns and FALSE
# for factors, whereas inherits(x, 'numeric') is FALSE for integer columns and
# would silently exclude them from the plots.
numeric_cols <- unname(vapply(iris, is.numeric, logical(1L)))
Then, we may select some arbitrary colours for the densities. Here, I picked three colours, which corresponds to the number of levels of `iris$Species`.
# Outline colours of the density curves (three, matching the number of
# levels of iris$Species).
my_cols <- c('blue4', 'darkorange', '#00b0a4')

# Return colour(s) x with the opacity scaled by a factor of 0.2.
adj_col <- \(x) adjustcolor(x, alpha.f = 0.2)

# Translucent versions of my_cols, in the same order; used to fill the area
# under each density curve.
my_transp_cols <- vapply(my_cols, adj_col, character(1L), USE.NAMES = FALSE)
Now we need to plot the densities. The function given below (i.e. `plot_densities`) can provide either marginal densities or densities conditional on some factor variable. If you would like to obtain the densities conditional on a factor variable, simply set `include_factor` to `TRUE` and pass the name of the factor variable of interest to the `factor` argument.
#' Plot kernel density estimates of selected columns of a data frame
#'
#' Draws one panel per selected column on a one-row, four-column layout.
#' Every panel gets a legend showing the mean and, in parentheses, the
#' standard deviation (both rounded to one decimal). Rows of `DF` with a
#' missing value in any column are dropped before plotting.
#'
#' Outline and fill colours are read from the objects `my_cols` and
#' `my_transp_cols`, which must be defined in the environment where this
#' function is defined. They are recycled when the grouping variable has more
#' levels than there are colours.
#'
#' @param DF A data frame.
#' @param columns Logical vector flagging the columns of `DF` to plot
#'   (`TRUE` = plot). Flagged columns must be numeric.
#' @param include_factor If `TRUE`, each panel shows one density per observed
#'   level of the grouping column `factor`; if `FALSE` (default), one marginal
#'   density per panel.
#' @param factor Name (or index) of the grouping column of `DF`. Only used,
#'   and then required, when `include_factor = TRUE`. The column is coerced to
#'   a factor and levels without observations are ignored.
#' @return Invisibly, the names of the plotted columns. Called for its side
#'   effect of drawing on the current graphics device; the graphical
#'   parameters (including the layout) are restored on exit.
plot_densities <- \(DF, columns, include_factor = FALSE, factor) {
  stopifnot(
    is.data.frame(DF),
    is.logical(columns),
    length(columns) <= ncol(DF)
  )
  if (include_factor) {
    if (missing(factor)) {
      stop("`factor` must be supplied when `include_factor = TRUE`.",
           call. = FALSE)
    }
    if (is.null(DF[[factor]])) {
      stop("`factor` does not refer to a column of `DF`.", call. = FALSE)
    }
  }

  DF <- DF[complete.cases(DF), , drop = FALSE]

  ## Positions of the flagged columns. Iterating over these (instead of over
  ## 1..number-of-TRUEs) also reaches numeric columns that come after a
  ## non-numeric one.
  plot_idx <- which(columns)

  ## setting up plotting device; the previous graphical parameters are saved
  ## first so that the caller's device state is restored afterwards
  old_par <- par(no.readonly = TRUE)
  on.exit(par(old_par), add = TRUE)
  layout(matrix(seq_len(4L), ncol = 4L))
  ## wide right margin + xpd so the legend can sit outside the plot region
  par(mar = c(5, 4, 4, 8) + 0.1, xpd = TRUE)

  ## formats a statistic with exactly one decimal
  r <- \(x) format(round(x, 1L), nsmall = 1L)

  if (include_factor) {
    ## densities per factor level, all levels of a column in the same panel
    grp <- droplevels(as.factor(DF[[factor]]))
    lvls <- levels(grp)
    no_of_levels <- length(lvls)
    if (no_of_levels == 0L) {
      stop("`factor` has no observed levels among the complete rows of `DF`.",
           call. = FALSE)
    }
    line_cols <- rep_len(my_cols, no_of_levels)
    fill_cols <- rep_len(my_transp_cols, no_of_levels)

    for (i in plot_idx) {
      ## values of column i per factor level, and their densities (computed
      ## once and reused for axis limits, curves and fills)
      values <- split(DF[[i]], grp)
      dens <- lapply(values, density)

      ## common axis limits so that every level fits in the panel
      x_lim <- range(vapply(dens, \(d) range(d$x), numeric(2L)))
      y_lim <- c(0, max(vapply(dens, \(d) max(d$y), numeric(1L))))

      ## mean and SD for column i per factor level
      means <- vapply(values, \(v) r(mean(v, na.rm = TRUE)), character(1L))
      SDs <- vapply(values, \(v) r(sd(v, na.rm = TRUE)), character(1L))

      for (j in seq_len(no_of_levels)) {
        if (j == 1L) {
          ## the first level opens the panel
          plot(dens[[j]], main = '',
               las = 1, col = line_cols[j], xlab = '',
               xlim = x_lim, ylim = y_lim, bty = 'n')
          title(main = names(DF)[i], xpd = TRUE, adj = 1)
        } else {
          lines(dens[[j]], col = line_cols[j])
        }
        polygon(dens[[j]], density = -1L, col = fill_cols[j])
      }

      ## add legend to the plot
      legend('topright', paste0(lvls, ': ', means, ' (', SDs, ')'),
             fill = fill_cols, bty = 'n',
             inset = c(-0.5, 0.1))
    }
  } else {
    ## marginal densities: one curve per panel
    for (i in plot_idx) {
      values <- DF[[i]]
      dens <- density(values)
      plot(dens,
           las = 1, main = names(DF)[i], col = my_cols[1L], xlab = '',
           bty = 'n')
      polygon(dens, density = -1L, col = my_transp_cols[1L])
      ## add legend to the plot
      legend('topright',
             paste0(names(DF)[i], ': ', r(mean(values, na.rm = TRUE)),
                    ' (', r(sd(values, na.rm = TRUE)), ')'),
             fill = my_transp_cols[1L], bty = 'n',
             inset = c(-0.5, 0.1))
    }
  }

  invisible(names(DF)[plot_idx])
}
We can save the output as a .pdf file. If you change the `layout` of the plotting device, then you will also have to play a little bit with the `width` and `height` to make it fit your specific situation.
# Page size (in inches) shared by both PDF files; wide enough for four
# panels side by side.
pdf_width <- 13
pdf_height <- 4

# marginal densities: one curve per numeric column
pdf(
  file = 'my_directory/my_plot.pdf', # change my_directory
  width = pdf_width,
  height = pdf_height
)
plot_densities(iris, numeric_cols)
dev.off()

# conditional densities: one curve per level of Species in every panel
pdf(
  file = 'my_directory/my_plot2.pdf', # change my_directory
  width = pdf_width,
  height = pdf_height
)
plot_densities(iris, numeric_cols, include_factor = TRUE, factor = 'Species')
dev.off()
I usually make .pdf files of my plots and then use this online converting tool to convert them to .png files.
I have shown the mean (SD) in the figure legends, but you may choose to show any statistic you like. Just replace the `mean()` and `sd()` calls in the function with the statistics of your interest.
Output
Marginal densities
Conditional densities
Note: use `function(x)` instead of `\(x)` if you use a version of R < 4.1.0.