Reorganizing and collapsing many csv's in R
Here's an elegant solution in the tidyverse
, without any cumbersome looping. It assumes that all the variables {un01
, ..., un05
, pm01
, ..., pm05
, ..., tr01
, ..., tr05
} exist as data.frame
s in your workspace; though it could be easily modified to accommodate another situation.
Warning
Be sure to use version 1.3.1 (or later) of the tidyverse
; and especially version 1.1.0 (or later) of the tidyr
package.
Solution
# Use the tidyverse.
library(tidyverse)
First create the standardize_csv()
function like so:
# Function to transform CSVs into a standard form.
standardize_csv <- function(csv_data, csv_prefix){
# Turn the word columns into a single column.
csv_data %>% pivot_longer(
# The word columns are every column except the 1st, which names the metric.
cols = !1,
names_to = "word"
) %>%
# Include the CSV prefix for reference.
mutate(
csv_prefix = csv_prefix
) %>%
# Rename and reformat the dataset.
select(
csv_prefix,
word,
metric = 1,
value
)
}
Then construct a named list()
of all your CSV datasets, where the names follow your convention: "un01"
, ..., "un05"
, ..., "tr01"
, ..., "tr05"
. Here, we get()
those datasets from your workspace.
# Find all CSV variables named by your convention, within your workspace.
csv_list <- sapply(
X = ls(pattern = "^[a-z]{2,2}0[1-5]$"),
FUN = get,
simplify = FALSE,
USE.NAMES = TRUE
)
Armed with standardize_csv()
, you can do the rest in a single workflow:
# Consolidate into the final form:
final_df <- csv_list %>%
mapply(
# Transform into standard form...
FUN = standardize_csv,
# ...the CSV datasets...
csv_data = .,
# ...with their prefixes.
csv_prefix = gsub(
x = names(.),
pattern = "^([a-z]*)(\\d*)$",
replacement = "\\1"
),
SIMPLIFY = FALSE
) %>%
# Stack those standardized datasets into a single table.
do.call(
what = bind_rows
) %>%
# Pivot the table to have one column for each family and metric ('un_mean',
# 'pm_SD', etc.).
pivot_wider(
names_from = c(csv_prefix, metric),
names_glue = "{csv_prefix}_{metric}",
values_from = value
)
# View the result.
final_df
Result
Given the CSV variables {un01
, un05
, pm01
} that you sampled
un01 <- structure(
list(
...1 = c("mean", "n", "SD"),
hill = c(2.26315789473684, 38, 2.0623289187203),
user = c(1.13157894736842, 38, 1.57978877533674),
name = c(2.42105263157895, 38, 1.96773403700762),
paint = c(4.18421052631579, 38, 1.81369062527503),
smile = c(6.05263157894737, 38, 1.98572430003074),
back = c(4.10526315789474, 38, 2.0767629916461),
mouse = c(7.35135135135135, 38, 1.75144727798686),
potato = c(5.94736842105263, 38, 2.01276156159657),
pain = c(1.34210526315789, 38, 1.69699759972873),
life = c(4.05263157894737, 38, 1.99928863593695)
),
row.names = c(NA, -3L),
class = c("tbl_df", "tbl", "data.frame")
)
un05 <- structure(
list(
...1 = c("mean", "n", "SD"),
house = c(5.08823529411765, 34, 1.65817802638964),
person = c(4.11764705882353, 34, 2.15694376772143),
city = c(2.47058823529412, 34, 1.70978102651927),
agressive = c(5, 34, 1.95401684183679)
),
row.names = c(NA, -3L),
class = c("tbl_df", "tbl", "data.frame")
)
pm01 <- structure(
list(
...1 = c("mean", "n", "SD"),
hill = c(5.3030303030303, 33, 1.64857606142484),
user = c(4.24242424242424, 33, 1.96898297082375),
name = c(7.18181818181818, 33, 1.18465568453838),
paint = c(3.27272727272727, 33, 1.71887912738082),
smile = c(1.15151515151515, 33, 4.8728523053289),
back = c(4.3030303030303, 33, 2.02306772756204),
mouse = c(7.06060606060606, 33, 1.57994054356869),
potato = c(3.34666666666667, 33, 1.36289077492212),
pain = c(4.17666666666667, 33, 1.25762044965971),
life = c(4.22121212121212, 33, 1.40884650861919)),
row.names = c(NA, -3L),
class = c("tbl_df", "tbl", "data.frame")
)
the result for final_df
should look like this:
# A tibble: 14 x 7
word pm_mean pm_n pm_SD un_mean un_n un_SD
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 hill 5.30 33 1.65 2.26 38 2.06
2 user 4.24 33 1.97 1.13 38 1.58
3 name 7.18 33 1.18 2.42 38 1.97
4 paint 3.27 33 1.72 4.18 38 1.81
5 smile 1.15 33 4.87 6.05 38 1.99
6 back 4.30 33 2.02 4.11 38 2.08
7 mouse 7.06 33 1.58 7.35 38 1.75
8 potato 3.35 33 1.36 5.95 38 2.01
9 pain 4.18 33 1.26 1.34 38 1.70
10 life 4.22 33 1.41 4.05 38 2.00
11 house NA NA NA 5.09 34 1.66
12 person NA NA NA 4.12 34 2.16
13 city NA NA NA 2.47 34 1.71
14 agressive NA NA NA 5 34 1.95
Note
Since no pm_05
was available, there are "blanks" (NA
s) for the corresponding words under the pm_*
columns. This illustrates how missing CSVs are handled, but as long as you provide all the CSV variables, there should be no blanks in your full result.
I think one good starting point could be to load all needed files in a list and all names of the files in another list or vector, e.g. something like
names2use <- list.files(path2direcitonwherefilesare)
fileList <- lapply(names2use, function(x) { read.csv(file.path(path2direcitonwherefilesare, x), ...)
Then you can create a matrix/data.frame an fill it up, e.g.
#my dummy data set
un01 <- data.frame(types=c("mean", "sd"), hill=c(0.1,0.2), boat=c(0.1,0.2))
un02 <- data.frame(types=c("mean", "sd"), trip=c(0.1,0.2), clip=c(0.1,0.2))
jp01 <- data.frame(types=c("mean", "sd"), hill=c(0.1,0.2), boat=c(0.1,0.2))
fileList <- list(un01, un02, jp01)
filenames <- c("un01", "un02", "jp01")
types <- unique(substr(filenames, start = 1, stop = 2))
infos <- as.character(un01[,1])
names2use <- c(paste(types, sort(rep(infos, length(types))), sep="_"))
rows <- unique(unlist(lapply(fileList, function(x) {names(x)})))[-1]
#creating empty an data.frame
data <- as.data.frame(matrix(NA, nrow=length(cols), ncol=length(names2use)+1))
data[,1] <- rows
names(data) <- c("word", sort(names2use))
i <- 1
for (file in fileList) {
filename <- substr(filenames[[i]],1,2)
for (n in 2:length(names(file))) {
data[data$word==names(file)[n], substr(names(data),1,2) %in% filename] <- file[[names(file)[n]]]
}
i <- i + 1
}
> data
word jp_mean jp_sd un_mean un_sd
1 hill 0.1 0.2 0.1 0.2
2 boat 0.1 0.2 0.1 0.2
3 trip NA NA 0.1 0.2
4 clip NA NA 0.1 0.2