Matching a sequence in a larger vector
I'd like to have a function which returns the initial indicies of matching subsequences of a vector. For example:
y <- c("a","a","a","b","c")
multi_match(c("a","a"), y)
# [1] 1 2
multi_match(c("a","b"), y)
# [1] 3
I have a rough implementation, but I feel like I must be reinventing the wheel, and it's a little clunky. Is there a better way to implement this, or is there a pre-existing function somewhere with similar functionality?
multi_match <- function(x, table){
# returns initial indicies of all substrings in table which match x
if(length(table) < length(x)){
return(NA)
}else{
check_mat <- matrix(nrow = length(x), ncol = length(table))
for(i in 1:length(x)){
check_mat[i,] <- table %in% x[i]
}
out <- vector(length = length(table))
for(i in 1:(length(table)-(length(x)-1))){
check <- vector(length=length(x))
for(j in 1:length(x)){
check[j] <- check_mat[j,(i+(j-1))]
}
out[i] <- all(check)
}
if(length(which(out))==0){
return(NA)
}else{
return(which(out))
}
}
}
Solution 1:
Try rollapply
in zoo:
library(zoo)
which(rollapply(y, 2, identical, c("a", "a")))
## [1] 1 2
which(rollapply(y, 2, identical, c("a", "b")))
## [1] 3
Solution 2:
set.seed(0)
a <- sample(1:6,12000, TRUE)
b <- 2:4
vecIn <- function(a,b){
which(
Reduce('+', lapply(seq_along(y <- lapply(b, '==', a)), function(x){
y[[x]][x:(length(a) - length(b) +x)]
}
)
) == length(b)
)
}
> vecIn(a,b)
[1] 2 154 986 1037 1046 1257 1266 1750 2375 2677 3184 3206
[13] 3499 3526 3882 4238 4311 4388 4437 4580 4714 4766 4827 5046
[25] 5279 5629 6153 6842 6856 6919 7200 7516 7520 7707 7824 7859
[37] 8140 8191 8687 9208 9281 9313 10022 10320 10617 10720 10958 11179
[49] 11567 11591 11698 11811
library(zoo)
library(rbenchmark)
func1 <- function(a,b){
gregexpr(paste0(b,collapse=""),paste0(a,collapse=""))
}
func2 <- function(a,b){
which(rollapply(a, length(b), identical, b))
}
func3 <- vecIn
Some benchmarks
benchmark(func1(a,b), func2(a,b), func3(a,b))
test replications elapsed relative user.self sys.self user.child
1 func1(a, b) 100 0.673 5.904 0.680 0.000 0
2 func2(a, b) 100 28.808 252.702 28.198 0.672 0
3 func3(a, b) 100 0.114 1.000 0.116 0.000 0
sys.child
1 0
2 0
3 0