ggplot2: histogram with normal curve

Think I got it:

set.seed(1)
df <- data.frame(PF = 10*rnorm(1000))
ggplot(df, aes(x = PF)) + 
    geom_histogram(aes(y =..density..),
                   breaks = seq(-50, 50, by = 10), 
                   colour = "black", 
                   fill = "white") +
stat_function(fun = dnorm, args = list(mean = mean(df$PF), sd = sd(df$PF)))

enter image description here


This has been answered here and partially here.

The area under a density curve equals 1, and the area under the histogram equals the width of the bars times the sum of their height ie. the binwidth times the total number of non-missing observations. To fit both on the same graph, one or other needs to be rescaled so that their areas match.

If you want the y-axis to have frequency counts, there are a number of options:

First simulate some data.

library(ggplot2)

set.seed(1)
dat_hist <- data.frame(
  group = c(rep("A", 200), rep("B",150)),
  value = c(rnorm(200, 20, 5), rnorm(150,25,10)))

# Set desired binwidth and number of non-missing obs
bw = 2
n_obs = sum(!is.na(dat_hist$value))

Option 1: Plot both histogram and density curve as density and then rescale the y axis

This is perhaps the easiest approach for a single histogram. Using the approach suggested by Carlos, plot both histogram and density curve as density

g <- ggplot(dat_hist, aes(value))  + 
geom_histogram(aes(y = ..density..), binwidth = bw, colour = "black") + 
stat_function(fun = dnorm, args = list(mean = mean(dat_hist$value), sd = sd(dat_hist$value)))

And then rescale the y axis.

ybreaks = seq(0,50,5) 
## On primary axis
g + scale_y_continuous("Counts", breaks = round(ybreaks / (bw * n_obs),3), labels = ybreaks)

## Or on secondary axis
g + scale_y_continuous("Density", sec.axis = sec_axis(
  trans = ~ . * bw * n_obs, name = "Counts", breaks = ybreaks))

Single histogram with normal curve

Option 2: Rescale the density curve using stat_function

With code tidied as per PatrickT's answer.

ggplot(dat_hist, aes(value))  + 
  geom_histogram(colour = "black", binwidth = bw) + 
  stat_function(fun = function(x) 
    dnorm(x, mean = mean(dat_hist$value), sd = sd(dat_hist$value)) * bw * n_obs)

Option 3: Create an external dataset and plot using geom_line.

Unlike the above options, this one works with facets. (EDITED to provide dplyr rather than plyr based solution). Note, the summarised dataset is being used as the primary, and the raw passed in for the histogram only.

library(tidyverse)

dat_hist %>% 
  group_by(group) %>% 
  nest(data = c(value)) %>% 
  mutate(y = map(data, ~ dnorm(
    .$value, mean = mean(.$value), sd = sd(.$value)
    ) * bw * sum(!is.na(.$value)))) %>% 
  unnest(c(data,y)) %>% 
  
  ggplot(aes(x = value)) +
  geom_histogram(data = dat_hist, binwidth = bw, colour = "black") +
  geom_line(aes(y = y)) + 
  facet_wrap(~ group)

Histogram with normal curve and facets

Option 4: Create external functions to edit the data on the fly

A bit over the top perhaps, but might be useful for someone?

## Function to create scaled dnorm data along full x axis range
dnorm_scaled <- function(data, x = NULL, binwidth = 1, xlim = NULL) {
  .x <- na.omit(data[,x])
  if(is.null(xlim))
    xlim = c(min(.x), max(.x))
  x_range = seq(xlim[1], xlim[2], length.out = 101)
  setNames(
    data.frame(
    x = x_range,
    y = dnorm(x_range, mean = mean(.x), sd = sd(.x)) * length(.x) * binwidth),
    c(x, "y"))
}

## Function to apply over groups
dnorm_scaled_group <- function(data, x = NULL, group = NULL, binwidth = NULL, xlim = NULL) {
  dat_hists <- lapply(
    split(data, data[, group]), dnorm_scaled,
      x = x, binwidth = binwidth, xlim = xlim)
  for(g in names(dat_hists))
    dat_hists[[g]][, "group"] <- g
  setNames(do.call(rbind, dat_hists), c(x, "y", group))
}

## Single histogram
ggplot(dat_hist, aes(value)) + 
  geom_histogram(binwidth = bw, colour = "black") + 
  geom_line(data = ~ dnorm_scaled(., "value", binwidth = bw), 
            aes(y = y)) 

## With a single faceting variable
ggplot(dat_hist, aes(value))  + 
  geom_histogram(binwidth = 2, colour = "black") + 
  geom_line(data = ~ dnorm_scaled_group(
    ., x = "value", group = "group", binwidth = 2, xlim = c(0,50)), 
    aes(y = y)) +
  facet_wrap(~ group)