R: split a semicolon-delimited column into rows
Here is a base R solution. Split the `PolId` field using `strsplit` and, for each split field, `cbind` it with the corresponding `Description`. This gives a list of matrices, which we `rbind` together. Finally, set the column names.
# Split every PolId string on ";" so each row yields a vector of ids.
id_pieces <- strsplit(DF$PolId, ";")
# Pair each id vector with its row's Description (one small matrix per row),
# then stack those matrices on top of each other.
row_blocks <- Map(cbind, id_pieces, DF$Description)
out <- do.call(rbind, row_blocks)
# Reuse the input's column names for the result.
colnames(out) <- colnames(DF)
giving (duplicate rows are kept; use `unique(out)` to drop them):
> out
PolId Description
[1,] "ABC123" "TEST1"
[2,] "ABC456" "TEST1"
[3,] "ABC789" "TEST1"
[4,] "ABC123" "TEST1"
[5,] "ABC456" "TEST1"
[6,] "ABC789" "TEST1"
[7,] "ABC123" "TEST1"
[8,] "ABC456" "TEST1"
[9,] "ABC789" "TEST1"
[10,] "AAA123" "TEST1"
[11,] "AAA123" "TEST2"
[12,] "ABB123" "TEST3"
[13,] "ABC123" "TEST3"
[14,] "ABB123" "TEST3"
[15,] "ABC123" "TEST3"
Note: We used this as the input:
# Example input: seven rows; each PolId holds one or more ids, every id
# terminated by ";".
DF <- data.frame(
  PolId = c(
    "ABC123;ABC456;ABC789;", "ABC123;ABC456;ABC789;", "ABC123;ABC456;ABC789;",
    "AAA123;", "AAA123;", "ABB123;ABC123;", "ABB123;ABC123;"
  ),
  Description = c(
    "TEST1", "TEST1", "TEST1", "TEST1", "TEST2", "TEST3", "TEST3"
  ),
  stringsAsFactors = FALSE
)
Here's a quick possible `data.table` solution:
library(data.table)
# Convert df to a data.table (setDT does this by reference), split each PolId
# on ";" within every Description group, then drop repeated rows.
dt <- setDT(df)
long_ids <- dt[, list(PolId = unlist(strsplit(as.character(PolId), ";"))),
  by = Description
]
unique(long_ids)
# Description PolId
# 1: TEST1 ABC123
# 2: TEST1 ABC456
# 3: TEST1 ABC789
# 4: TEST1 AAA123
# 5: TEST2 AAA123
# 6: TEST3 ABB123
# 7: TEST3 ABC123
Per your edit, here is another option (in case you have more than two columns):
library(splitstackshape)
# Reshape to long form: one row per ";"-separated PolId, with the remaining
# columns carried along; then keep only the distinct rows.
long_df <- cSplit(df, splitCols = "PolId", sep = ";", direction = "long")
unique(long_df)
# PolId Description Document.Type
# 1: ABC123 TEST1 Pol1
# 2: ABC456 TEST1 Pol1
# 3: ABC789 TEST1 Pol1
# 4: AAA123 TEST1 End1
# 5: AAA123 TEST2 End2
# 6: ABB123 TEST3 End1
# 7: ABC123 TEST3 End1
You could try `unnest` from `tidyr` after splitting the "PolId" column, and then get the `unique` rows:
library(dplyr)
library(tidyr)
# Turn PolId into a list-column of ids (split on ";"), expand it to one row
# per id with unnest(), and keep the distinct rows. unnest() is given a data
# frame here because current tidyr expects one rather than a bare named list;
# this form also works with older tidyr releases.
df %>%
  mutate(PolId = strsplit(as.character(PolId), ";")) %>%
  unnest(PolId) %>%
  unique()
Or using base R with `stack`/`strsplit`/`duplicated`. Split the "PolId" column (`strsplit`) by the delimiter (`;`), name the output list elements with the "Description" column, `stack` the list to get a 'data.frame', and use `duplicated` to remove the duplicate rows.
# Split PolId on ";" and label each vector of ids with its row's Description.
id_pieces <- strsplit(df$PolId, ";")
names(id_pieces) <- df$Description
# stack() flattens the named list into a two-column data.frame (values, ind).
df1 <- stack(id_pieces)
# Keep the first occurrence of every row, then restore the input's column names.
deduped <- df1[!duplicated(df1), ]
setNames(deduped, names(df))
# PolId Description
#1 ABC123 TEST1
#2 ABC456 TEST1
#3 ABC789 TEST1
#10 AAA123 TEST1
#11 AAA123 TEST2
#12 ABB123 TEST3
#13 ABC123 TEST3
Or another option without using strsplit
# For each Description, join its PolId strings with ";" and pass the result
# through a Perl-regex substitution; v1 is indexed by Description.
v1 <- with(df, tapply(PolId, Description, FUN= function(x) {
# Collapse the group's PolId values into a single ";"-separated string.
x1 <- paste(x, collapse=";")
# NOTE(review): the back-referencing lookahead appears intended to delete an
# id (together with its trailing ";") whenever the same id occurs again later
# in the string, leaving only the last occurrence -- confirm on real data.
gsub('(\\b\\S+\\b)(?=.*\\b\\1\\b.*);', '', x1, perl=TRUE)}))
library(stringr)
# Repeat each Description name once per run of word characters in its string
# (presumably one run per remaining id -- assumes ids contain only word chars).
Description <- rep(names(v1), str_count(v1, '\\w+'))
# Replace runs of ";" with a space and read the whitespace-separated tokens
# back in as character values.
PolId <- scan(text=gsub(';+', ' ', v1), what='', quiet=TRUE)
# Combine the two parallel vectors into the result.
data.frame(PolId, Description)
# PolId Description
#1 ABC123 TEST1
#2 ABC456 TEST1
#3 ABC789 TEST1
#4 AAA123 TEST1
#5 AAA123 TEST2
#6 ABB123 TEST3
#7 ABC123 TEST3