ggplot2: histogram with normal curve
Think I got it:
set.seed(1)
df <- data.frame(PF = 10*rnorm(1000))
ggplot(df, aes(x = PF)) +
geom_histogram(aes(y =..density..),
breaks = seq(-50, 50, by = 10),
colour = "black",
fill = "white") +
stat_function(fun = dnorm, args = list(mean = mean(df$PF), sd = sd(df$PF)))
This has been answered here and partially here.
The area under a density curve equals 1, and the area under the histogram equals the width of the bars times the sum of their height ie. the binwidth times the total number of non-missing observations. To fit both on the same graph, one or other needs to be rescaled so that their areas match.
If you want the y-axis to have frequency counts, there are a number of options:
First simulate some data.
library(ggplot2)
set.seed(1)
dat_hist <- data.frame(
group = c(rep("A", 200), rep("B",150)),
value = c(rnorm(200, 20, 5), rnorm(150,25,10)))
# Set desired binwidth and number of non-missing obs
bw = 2
n_obs = sum(!is.na(dat_hist$value))
Option 1: Plot both histogram and density curve as density and then rescale the y axis
This is perhaps the easiest approach for a single histogram. Using the approach suggested by Carlos, plot both histogram and density curve as density
g <- ggplot(dat_hist, aes(value)) +
geom_histogram(aes(y = ..density..), binwidth = bw, colour = "black") +
stat_function(fun = dnorm, args = list(mean = mean(dat_hist$value), sd = sd(dat_hist$value)))
And then rescale the y axis.
ybreaks = seq(0,50,5)
## On primary axis
g + scale_y_continuous("Counts", breaks = round(ybreaks / (bw * n_obs),3), labels = ybreaks)
## Or on secondary axis
g + scale_y_continuous("Density", sec.axis = sec_axis(
trans = ~ . * bw * n_obs, name = "Counts", breaks = ybreaks))
Option 2: Rescale the density curve using stat_function
With code tidied as per PatrickT's answer.
ggplot(dat_hist, aes(value)) +
geom_histogram(colour = "black", binwidth = bw) +
stat_function(fun = function(x)
dnorm(x, mean = mean(dat_hist$value), sd = sd(dat_hist$value)) * bw * n_obs)
Option 3: Create an external dataset and plot using geom_line.
Unlike the above options, this one works with facets. (EDITED to provide dplyr
rather than plyr
based solution). Note, the summarised dataset is being used as the primary, and the raw passed in for the histogram only.
library(tidyverse)
dat_hist %>%
group_by(group) %>%
nest(data = c(value)) %>%
mutate(y = map(data, ~ dnorm(
.$value, mean = mean(.$value), sd = sd(.$value)
) * bw * sum(!is.na(.$value)))) %>%
unnest(c(data,y)) %>%
ggplot(aes(x = value)) +
geom_histogram(data = dat_hist, binwidth = bw, colour = "black") +
geom_line(aes(y = y)) +
facet_wrap(~ group)
Option 4: Create external functions to edit the data on the fly
A bit over the top perhaps, but might be useful for someone?
## Function to create scaled dnorm data along full x axis range
dnorm_scaled <- function(data, x = NULL, binwidth = 1, xlim = NULL) {
.x <- na.omit(data[,x])
if(is.null(xlim))
xlim = c(min(.x), max(.x))
x_range = seq(xlim[1], xlim[2], length.out = 101)
setNames(
data.frame(
x = x_range,
y = dnorm(x_range, mean = mean(.x), sd = sd(.x)) * length(.x) * binwidth),
c(x, "y"))
}
## Function to apply over groups
dnorm_scaled_group <- function(data, x = NULL, group = NULL, binwidth = NULL, xlim = NULL) {
dat_hists <- lapply(
split(data, data[, group]), dnorm_scaled,
x = x, binwidth = binwidth, xlim = xlim)
for(g in names(dat_hists))
dat_hists[[g]][, "group"] <- g
setNames(do.call(rbind, dat_hists), c(x, "y", group))
}
## Single histogram
ggplot(dat_hist, aes(value)) +
geom_histogram(binwidth = bw, colour = "black") +
geom_line(data = ~ dnorm_scaled(., "value", binwidth = bw),
aes(y = y))
## With a single faceting variable
ggplot(dat_hist, aes(value)) +
geom_histogram(binwidth = 2, colour = "black") +
geom_line(data = ~ dnorm_scaled_group(
., x = "value", group = "group", binwidth = 2, xlim = c(0,50)),
aes(y = y)) +
facet_wrap(~ group)