The most efficient way to check if all values in a row are the same or are missing
Solution 1:
In Julia using a matrix for such operation, similarly to R, is also natural:
julia> using BenchmarkTools
julia> function helper(x)
nonempty = false
local fv
for v in x
if !ismissing(v)
if nonempty
v == fv || return false
else
fv = v
nonempty = true
end
end
end
return true
end
helper (generic function with 1 method)
julia> mat = rand([1, 2, missing], 3_000_000, 303);
julia> @benchmark helper.(eachrow($mat))
BenchmarkTools.Trial: 34 samples with 1 evaluation.
Range (min … max): 139.440 ms … 154.628 ms ┊ GC (min … max): 5.74% … 5.15%
Time (median): 147.890 ms ┊ GC (median): 5.27%
Time (mean ± σ): 147.876 ms ± 3.114 ms ┊ GC (mean ± σ): 5.23% ± 0.95%
▃ ▃ ▃ ▃ ██ ▃
▇▁▁▁▁▁▁▁▁▁▇▁▁▁▁▁▁▁█▁▁▁▁█▇▇▇█▁▁█▇▁██▇█▁▇▁▇▇▁▇▇▁▇▁▇▇▇▁▁▁▁▇▁▁▁▁▇ ▁
139 ms Histogram: frequency by time 155 ms <
Memory estimate: 114.80 MiB, allocs estimate: 6.
The operation can be also done in DataFrames.jl, here is an example how to do it:
julia> function helper2(x, i)
nonempty = false
local fv
for v in x
vv = v[i]
if !ismissing(vv)
if nonempty
vv == fv || return false
else
fv = vv
nonempty = true
end
end
end
return true
end
helper2 (generic function with 1 method)
julia> df = DataFrame(mat, :auto, copycols=false); # copycols to avoid copying as data is large
julia> @benchmark helper2.(Ref(identity.(eachcol($df))), 1:nrow($df))
BenchmarkTools.Trial: 46 samples with 1 evaluation.
Range (min … max): 105.265 ms … 123.345 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 110.682 ms ┊ GC (median): 0.00%
Time (mean ± σ): 110.581 ms ± 2.692 ms ┊ GC (mean ± σ): 0.00% ± 0.00%
▄ ▂ ▄█▂
▄▁▁▄▄▁▁▆▄▁▁▁▆▆█▄█▆███▄▄▁█▄▁▁▁▁▁▁▁▁▁▁▁▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄ ▁
105 ms Histogram: frequency by time 123 ms <
Memory estimate: 385.28 KiB, allocs estimate: 15.
(if anything in the code is not clear please let me know and I can explain)
EDIT
If you a small union of unique eltypes
do:
helper2.(Ref(collect(Union{unique(typeof.(eachcol(df)))...}, eachcol(df))), 1:nrow(df))
If Union{unique(typeof.(eachcol(df)))...}
is not a small collection then another solution would be better so please comment if this is good enough for you.
Solution 2:
Structure matters. Here is a matrix approach in R, using the matrixStats
package (source), which ships optimized matrix functions implemented in C.
x = sample.int(3, size = 303*3e6, replace = T)
m = matrix(x, ncol = 303, byrow = T)
bench::mark(
var = matrixStats::rowVars(m, na.rm = T) == 0
)
On my (certainly not high performance) machine this takes roughly 3.5 seconds for a 3 million row matrix.
Solution 3:
Do you have about 350MB of RAM available? If so, you can try this function
rowequal <- function(x) {
undetermined <- function(x, can_del) {
if (length(can_del) < 1L)
return(x)
x[-can_del]
}
if (ncol(x) < 1L)
return(logical())
out <- logical(nrow(x))
if (ncol(x) < 2L)
return(!out)
x1 <- x[[1L]]
need_compare <- undetermined(seq_len(nrow(x)), which(x1 != x[[2L]]))
x1[nas] <- x[[2L]][nas <- which(is.na(x1))]
if (ncol(x) > 2L) {
for (j in 3:ncol(x)) {
need_compare <- undetermined(
need_compare, which(x1[need_compare] != x[[j]][need_compare])
)
x1[nas] <- x[[j]][nas <- which(is.na(x1))]
if (length(need_compare) < 1L)
return(out)
}
}
`[<-`(out, need_compare, TRUE)
}
Below is the benchmark
> dim(d)
[1] 3000000 300
> bench::mark(f(d), rowequal(d), iterations = 10)
# A tibble: 2 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <lis> <lis>
1 f(d) 2.97s 2.98s 0.335 34.4MB 0 10 0 29.8s <lgl ~ <Rpro~ <ben~ <tib~
2 rowequal(d) 88.52ms 93.34ms 10.7 352.2MB 0 10 0 932.5ms <lgl ~ <Rpro~ <ben~ <tib~
, where f
(I got this from this post) and d
are
f <- function(x) {
v1 = do.call(pmin, c(x, na.rm = TRUE))
v2 = do.call(pmax, c(x, na.rm = TRUE))
v1 == v2
}
mkDT <- function(rows, cols, type = as.integer) {
data.table::setDT(
replicate(cols, sample(type(c(1:10, NA)), rows, replace = TRUE), simplify = FALSE)
)
}
d <- mkDT(3e6, 300)
An Rcpp version of the code. It could achieve its best performance (in terms of both memory usage and speed) if you can ensure that all the columns in your dataframe are of the numeric
type. I guess this is the most efficient solution to your problem (in R).
rowequalcpp <- function(x) {
if (ncol(x) < 1L)
return(logical())
out <- logical(nrow(x))
if (ncol(x) < 2L)
return(!out)
mark_equal(out, x)
out
}
Rcpp::sourceCpp(code = '
#include <Rcpp.h>
// [[Rcpp::export]]
void mark_equal(Rcpp::LogicalVector& res, const Rcpp::DataFrame& df) {
Rcpp::NumericVector x1 = df[0];
int n = df.nrows();
int need_compare[n];
for (int i = 0; i < n; ++i)
need_compare[i] = i;
for (int j = 1; j < df.length(); ++j) {
Rcpp::NumericVector dfj = df[j];
for (int i = 0; i < n; ) {
int itmp = need_compare[i];
if (Rcpp::NumericVector::is_na(x1[itmp]))
x1[itmp] = dfj[itmp];
if (!Rcpp::NumericVector::is_na(dfj[itmp]) && x1[itmp] != dfj[itmp]) {
need_compare[i] = need_compare[n - 1];
need_compare[n-- - 1] = itmp;
} else
++i;
}
if (n < 1)
return;
}
for (int i = 0; i < n; ++i)
res[need_compare[i]] = TRUE;
}
')
Benchmark (the type of your columns are crucial for the performance):
> d <- mkDT(3000000, 300, as.integer)
> bench::mark(rowequal(d), rowequalcpp(d), iterations = 10)
# A tibble: 2 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:> <bch:t> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <lis> <lis>
1 rowequal(d) 100ms 147ms 7.07 398MB 3.03 7 3 991ms <lgl ~ <Rpro~ <ben~ <tib~
2 rowequalcpp(d) 101ms 102ms 9.35 309MB 2.34 8 2 855ms <lgl ~ <Rpro~ <ben~ <tib~
> d <- mkDT(3000000, 300, as.numeric)
> bench::mark(rowequal(d), rowequalcpp(d), iterations = 10)
# A tibble: 2 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <lis>
1 rowequal(d) 103.7ms 110.8ms 8.05 349.3MB 0.895 9 1 1.12s <lgl [~ <Rprofm~ <ben~
2 rowequalcpp(d) 26.3ms 27.3ms 36.3 11.4MB 0 10 0 275.2ms <lgl [~ <Rprofm~ <ben~
# ... with 1 more variable: gc <list>