Adding Rows at Different Positions to a Contingency Table in R

I am using R. For this random data set that I generated, I created the following contingency table:

library(memisc)
library(dplyr)

set.seed(123)

v1 <- c("2010-2011","2011-2012", "2012-2013", "2013-2014", "2014-2015") 
v2 <- c("A", "B", "C", "D", "E")
v3 <- c("Z", "Y", "X", "W" )
v4 <- c("data_1", "data_2", "data_3", "data_4" )


dates <- as.factor(sample(v1, 1000, replace=TRUE, prob=c(0.5, 0.2, 0.1, 0.1, 0.1)))

types <- as.factor(sample(v2,1000, replace=TRUE, prob=c(0.3, 0.2, 0.1, 0.1, 0.1)))

types2 <- as.factor(sample(v3, 1000, replace=TRUE, prob=c(0.3, 0.5, 0.1, 0.1)))

names <- as.factor(sample(v3, 1000, replace=TRUE, prob=c(0.3, 0.5, 0.1, 0.1)))

var = rnorm(1000,10,10)

problem_data = data.frame(var,dates, types, types2, names)


summary <- xtabs(~dates+names+types+types2, problem_data)
t = ftable(summary, row.vars=1, col.vars=2:4)

show_html(t)

enter image description here

If I wanted to add a row containing "Grand Totals" to the bottom of this table, I could do this as follows:

totals <- problem_data %>% group_by(names,  types, types2) %>% summarise(totals = n())
memisc::show_html(rbind(t, totals = totals$totals), varinfront = FALSE)

enter image description here

Is it possible to add "totals" at arbitrary positions in this contingency table?

For example, suppose I want to find the totals for the first two rows (2010-2011, 2011-2012), and then insert this total in this table at the third row. I can calculate the totals for the firs two rows:

first_two_rows = subset(problem_data, dates %in% c("2010-2011","2011-2012"))

totals_first_two_rows <- first_two_rows %>% group_by(names,  types, types2) %>% summarise(totals = n())

But how can this "totals_first_two_rows" be added to the third position of the contingency table? Using this stackoverflow post (Add new row to dataframe, at specific row-index, not appended?), I tried using the function provided in the answer:

insertRow <- function(existingDF, newrow, r) {
    existingDF[seq(r+1,nrow(existingDF)+1),] <- existingDF[seq(r,nrow(existingDF)),]
    existingDF[r,] <- newrow
    existingDF
}

insertRow(t, totals_first_two_rows, 3)

But this returns the following error:

Error in `[<-`(`*tmp*`, seq(r + 1, nrow(existingDF) + 1), , value = existingDF[seq(r,  : 
  subscript out of bounds

Can someone please show me how to fix this problem?

Thanks!


Solution 1:

insertRow doesn't work because t isn't data.frame (It surprised me that rbind(t, totals = totals$totals) works).
If you want uncommon table formats, I think you can't avoid to make it semi-manually.
It takes time but you can fully customize.

I introduce package flextable (Below is an example):

NOTE: !!sym(str) and !!!syms(strs) are technic to use string colname(s) in dplyr function.
e.g., iris %>% mutate(!!sym("colname") := !!sym("Sepal.Length") * 10) means iris %>% mutate(colname = Sepal.Length * 10)

loading package and data preparation

library(memisc)
library(dplyr)
library(tidyr)
library(flextable)
library(officer)

set.seed(123)
v1 <- c("2010-2011","2011-2012", "2012-2013", "2013-2014", "2014-2015") 
v2 <- c("A", "B", "C", "D", "E")
v3 <- c("Z", "Y", "X", "W" )
v4 <- c("data_1", "data_2", "data_3", "data_4" )

dates <- as.factor(sample(v1, 1000, replace=TRUE, prob=c(0.5, 0.2, 0.1, 0.1, 0.1)))
types <- as.factor(sample(v2,1000, replace=TRUE, prob=c(0.3, 0.2, 0.1, 0.1, 0.1)))
types2 <- as.factor(sample(v3, 1000, replace=TRUE, prob=c(0.3, 0.5, 0.1, 0.1)))
names <- as.factor(sample(v4, 1000, replace=TRUE, prob=c(0.3, 0.5, 0.1, 0.1)))  # modified
var <- rnorm(1000,10,10)

problem_data <- data.frame(var,dates, types, types2, names)
contingency_table <- problem_data %>%    # this code comes from your previous question (a little changed).
  group_by(dates, names, types, types2) %>% 
  summarise(value = n(), .groups = "drop")  # I summarize_function (from mean() to n() and colname).

first_two_rows = subset(problem_data, dates %in% c("2010-2011","2011-2012"))
totals_first_two_rows <- first_two_rows %>% group_by(names,  types, types2) %>% summarise(totals = n(), .groups = "drop")

making df that you want to output (but header is pasted)

# variable preparation
ind_cols <- c("names", "types", "types2")
ind_rows <- c("dates")
ind_sep = "__"    # use to make temp_header (pasted to single string; e.g., header1__header2__ ...)

# convet long to wide 
base_data <- contingency_table %>% 
  tidyr::pivot_wider(names_from = ind_cols, values_from = value, 
                     names_sep = ind_sep, names_sort = TRUE) %>% 
  arrange_at(ind_rows) %>% 
  mutate_if(is.factor, as.character)

#   dates     data_1__A__W data_1__A__X data_1__A__Y ...
#   <chr>            <int>        <int>        <int>
# 1 2010-2011            4            2           21
# 2 2011-2012            2            2            7
# 3 2012-2013           NA           NA            2
# 4 2013-2014            2            2            1
# 5 2014-2015            1           NA            6

totals_f2r_wide <- totals_first_two_rows %>% 
  mutate(dates = "totals") %>%    # this colname is important. it must be same as the colname what you want to put the value.
  tidyr::pivot_wider(names_from = ind_cols, values_from = totals, 
                     names_sep = ind_sep, names_sort = TRUE)

#   dates  data_1__A__W data_1__A__X data_1__A__Y
#   <chr>         <int>        <int>        <int>
# 1 totals            6            4           28


base_data2 <- bind_rows(base_data[1:2,], totals_f2r_wide, base_data[3:nrow(base_data),]) %>% 
  mutate_if(is.numeric, ~ replace_na(.x, 0))

## add header description col
# if you don't want it, please skip this part.
base_data3 <- base_data2 %>% 
  mutate(!!sym(paste0(ind_cols, sep = ":", collapse = ind_sep)) := NA) %>% 
  select(one_of(ind_rows), paste0(ind_cols, sep = ":", collapse = ind_sep), names(.))  # column order change

#   dates     `names:__types:__types2:` data_1__A__W data_1__A__X
#   <chr>     <lgl>                            <dbl>        <dbl>
# 1 2010-2011 NA                                   4            2
# 2 2011-2012 NA                                   2            2
# 3 totals    NA                                   6            4
# 4 2012-2013 NA                                   0            0
# 5 2013-2014 NA                                   2            2
# 6 2014-2015 NA                                   1            0

header info preparation

culculating that how many rows is each element of header across.

header_info_maker <- function(base_data) {

  pasted_ind <- base_data %>% colnames()
  ind_num <- length(ind_cols)

  ind_d <- tibble(a = pasted_ind) %>% 
    separate(a, into = letters[1:ind_num], sep = ind_sep, fill = "left") %>%   # warning occurred, but no problem
    mutate_all(~ replace_na(.x, ""))
#    a        b        c      
#    <chr>    <chr>    <chr>  
#  1 ""       ""       dates  
#  2 "names:" "types:" types2:
#  3 "data_1" "A"      W      
#  4 "data_1" "A"      X      
#  5 "data_1" "A"      Y      
#  6 "data_1" "A"      Z      
#  7 "data_1" "B"      W      
  
  group_ind <- ind_d %>% 
    mutate_all(~ cumsum(.x != lag(.x, default = "xxxxx")))
#        a     b     c
#    <int> <int> <int>
#  1     1     1     1
#  2     2     2     2
#  3     3     3     3
#  4     3     3     4
#  5     3     3     5
#  6     3     3     6
#  7     3     4     7
  
  sapply(1:ncol(ind_d), function(x){
    tibble(ind = ind_d[[x]], g_ind = group_ind[[x]]) %>% 
      count(g_ind, ind) %>% 
      select(-g_ind) %>% 
      as.list()
  }, simplify = FALSE)
}

header_info <- header_info_maker(base_data3)

making table

# convert df to flextable and delete origin pasted header.
ft <- flextable(base_data3) %>% delete_part(part = "header")
# add header
for(i in length(header_info):1){
  ft <- add_header_row(ft, colwidths = header_info[[i]]$n, values = header_info[[i]]$ind)
}
# change design
ft <- ft %>% 
  theme_vanilla() %>% 
  vline(j = 1, border = fp_border(width = 2)) %>% 
  align(align = "center", part = "all") %>% 
  hline(i = c(2,3), border = fp_border(width = 2))
ft

enter image description here