Reorganizing and collapsing many csv's in R

Here's an elegant solution in the tidyverse, without any cumbersome looping. It assumes that all the variables {un01, ..., un05, pm01, ..., pm05, ..., tr01, ..., tr05} exist as data.frames in your workspace; though it could be easily modified to accommodate another situation.

Warning

Be sure to use version 1.3.1 (or later) of the tidyverse; and especially version 1.1.0 (or later) of the tidyr package.

Solution

# Use the tidyverse.
library(tidyverse)

First create the standardize_csv() function like so:

# Function to transform CSVs into a standard form.
standardize_csv <- function(csv_data, csv_prefix){
  # Turn the word columns into a single column.
  csv_data %>% pivot_longer(
    # The word columns are every column except the 1st, which names the metric.
    cols = !1,
    names_to = "word"
  ) %>%
    # Include the CSV prefix for reference.
    mutate(
      csv_prefix = csv_prefix
    ) %>%
    # Rename and reformat the dataset.
    select(
      csv_prefix,
      word,
      metric = 1,
      value
    )
}

Then construct a named list() of all your CSV datasets, where the names follow your convention: "un01", ..., "un05", ..., "tr01", ..., "tr05". Here, we get() those datasets from your workspace.

# Find all CSV variables named by your convention, within your workspace.
csv_list <- sapply(
  X = ls(pattern = "^[a-z]{2,2}0[1-5]$"),
  FUN = get,
  simplify = FALSE,
  USE.NAMES = TRUE
)

Armed with standardize_csv(), you can do the rest in a single workflow:

# Consolidate into the final form:
final_df <- csv_list %>%
  mapply(
    # Transform into standard form...
    FUN = standardize_csv,
    # ...the CSV datasets...
    csv_data = .,
    # ...with their prefixes.
    csv_prefix = gsub(
      x = names(.),
      pattern = "^([a-z]*)(\\d*)$",
      replacement = "\\1"
    ),
    
    SIMPLIFY = FALSE
  ) %>%
  # Stack those standardized datasets into a single table.
  do.call(
    what = bind_rows
  ) %>%
  # Pivot the table to have one column for each family and metric ('un_mean',
  # 'pm_SD', etc.).
  pivot_wider(
    names_from = c(csv_prefix, metric),
    names_glue = "{csv_prefix}_{metric}",
    values_from = value
  )

# View the result.
final_df

Result

Given the CSV variables {un01, un05, pm01} that you sampled

un01 <- structure(
  list(
    ...1 = c("mean", "n", "SD"),
    hill = c(2.26315789473684, 38, 2.0623289187203),
    user = c(1.13157894736842, 38, 1.57978877533674),
    name = c(2.42105263157895, 38, 1.96773403700762),
    paint = c(4.18421052631579, 38, 1.81369062527503),
    smile = c(6.05263157894737, 38, 1.98572430003074),
    back = c(4.10526315789474, 38, 2.0767629916461),
    mouse = c(7.35135135135135, 38, 1.75144727798686),
    potato = c(5.94736842105263, 38, 2.01276156159657),
    pain = c(1.34210526315789, 38, 1.69699759972873),
    life = c(4.05263157894737, 38, 1.99928863593695)
  ),
  row.names = c(NA, -3L),
  class = c("tbl_df", "tbl", "data.frame")
)

un05 <- structure(
  list(
    ...1 = c("mean", "n", "SD"),
    house = c(5.08823529411765, 34, 1.65817802638964),
    person = c(4.11764705882353, 34, 2.15694376772143),
    city = c(2.47058823529412, 34, 1.70978102651927),
    agressive = c(5, 34, 1.95401684183679)
  ),
  row.names = c(NA, -3L),
  class = c("tbl_df", "tbl", "data.frame")
)

pm01 <- structure(
  list(
    ...1 = c("mean", "n", "SD"),
    hill = c(5.3030303030303, 33, 1.64857606142484),
    user = c(4.24242424242424, 33, 1.96898297082375),
    name = c(7.18181818181818, 33, 1.18465568453838),
    paint = c(3.27272727272727, 33, 1.71887912738082),
    smile = c(1.15151515151515, 33, 4.8728523053289),
    back = c(4.3030303030303, 33, 2.02306772756204),
    mouse = c(7.06060606060606, 33, 1.57994054356869),
    potato = c(3.34666666666667, 33, 1.36289077492212),
    pain = c(4.17666666666667, 33, 1.25762044965971),
    life = c(4.22121212121212, 33, 1.40884650861919)),
  row.names = c(NA, -3L),
  class = c("tbl_df", "tbl", "data.frame")
)

the result for final_df should look like this:

# A tibble: 14 x 7
   word      pm_mean  pm_n pm_SD un_mean  un_n un_SD
   <chr>       <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl>
 1 hill         5.30    33  1.65    2.26    38  2.06
 2 user         4.24    33  1.97    1.13    38  1.58
 3 name         7.18    33  1.18    2.42    38  1.97
 4 paint        3.27    33  1.72    4.18    38  1.81
 5 smile        1.15    33  4.87    6.05    38  1.99
 6 back         4.30    33  2.02    4.11    38  2.08
 7 mouse        7.06    33  1.58    7.35    38  1.75
 8 potato       3.35    33  1.36    5.95    38  2.01
 9 pain         4.18    33  1.26    1.34    38  1.70
10 life         4.22    33  1.41    4.05    38  2.00
11 house       NA       NA NA       5.09    34  1.66
12 person      NA       NA NA       4.12    34  2.16
13 city        NA       NA NA       2.47    34  1.71
14 agressive   NA       NA NA       5       34  1.95

Note

Since no pm_05 was available, there are "blanks" (NAs) for the corresponding words under the pm_* columns. This illustrates how missing CSVs are handled, but as long as you provide all the CSV variables, there should be no blanks in your full result.

I think one good starting point could be to load all needed files in a list and all names of the files in another list or vector, e.g. something like

names2use <- list.files(path2direcitonwherefilesare)
fileList <- lapply(names2use, function(x) { read.csv(file.path(path2direcitonwherefilesare, x), ...)

Then you can create a matrix/data.frame an fill it up, e.g.

#my dummy data set
un01 <- data.frame(types=c("mean", "sd"), hill=c(0.1,0.2), boat=c(0.1,0.2))
un02 <- data.frame(types=c("mean", "sd"), trip=c(0.1,0.2), clip=c(0.1,0.2))
jp01 <- data.frame(types=c("mean", "sd"), hill=c(0.1,0.2), boat=c(0.1,0.2))
fileList <- list(un01, un02, jp01)
filenames <- c("un01", "un02", "jp01")

types <- unique(substr(filenames, start = 1, stop = 2))
infos <- as.character(un01[,1])
names2use <- c(paste(types, sort(rep(infos, length(types))), sep="_"))
rows <- unique(unlist(lapply(fileList, function(x) {names(x)})))[-1]

#creating empty an data.frame
data <- as.data.frame(matrix(NA, nrow=length(cols), ncol=length(names2use)+1))
data[,1] <- rows
names(data) <- c("word", sort(names2use))

i <- 1
for (file in fileList) {
  filename <- substr(filenames[[i]],1,2)
  for (n in 2:length(names(file))) {
    data[data$word==names(file)[n], substr(names(data),1,2) %in% filename] <- file[[names(file)[n]]]
  }
  i <- i + 1
}

> data
  word jp_mean jp_sd un_mean un_sd
1 hill     0.1   0.2     0.1   0.2
2 boat     0.1   0.2     0.1   0.2
3 trip      NA    NA     0.1   0.2
4 clip      NA    NA     0.1   0.2

Reorganizing and collapsing many csv's in R

Warning

Solution

Result

Note

Related

Recent Posts