The most efficient way to check if all values in a row are the same or are missing

Solution 1:

In Julia, as in R, using a matrix for such an operation is natural:

julia> using BenchmarkTools

julia> function helper(x)
           nonempty = false
           local fv
           for v in x
               if !ismissing(v)
                   if nonempty
                       v == fv || return false
                   else
                       fv = v
                       nonempty = true
                   end
               end
           end
           return true
       end
helper (generic function with 1 method)

julia> mat = rand([1, 2, missing], 3_000_000, 303);

julia> @benchmark helper.(eachrow($mat))
BenchmarkTools.Trial: 34 samples with 1 evaluation.
 Range (min … max):  139.440 ms … 154.628 ms  ┊ GC (min … max): 5.74% … 5.15%
 Time  (median):     147.890 ms               ┊ GC (median):    5.27%        
 Time  (mean ± σ):   147.876 ms ±   3.114 ms  ┊ GC (mean ± σ):  5.23% ± 0.95%

                    ▃    ▃   ▃  ▃  ██ ▃
  ▇▁▁▁▁▁▁▁▁▁▇▁▁▁▁▁▁▁█▁▁▁▁█▇▇▇█▁▁█▇▁██▇█▁▇▁▇▇▁▇▇▁▇▁▇▇▇▁▁▁▁▇▁▁▁▁▇ ▁
  139 ms           Histogram: frequency by time          155 ms <

 Memory estimate: 114.80 MiB, allocs estimate: 6.

The operation can also be done in DataFrames.jl; here is an example of how to do it:

julia> function helper2(x, i)
           nonempty = false
           local fv
           for v in x
               vv = v[i]
               if !ismissing(vv)
                   if nonempty
                       vv == fv || return false
                   else
                       fv = vv
                       nonempty = true
                   end
               end
           end
           return true
       end
helper2 (generic function with 1 method)

julia> df = DataFrame(mat, :auto, copycols=false); # copycols=false avoids copying, as the data is large

julia> @benchmark helper2.(Ref(identity.(eachcol($df))), 1:nrow($df))
BenchmarkTools.Trial: 46 samples with 1 evaluation.
 Range (min … max):  105.265 ms … 123.345 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     110.682 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   110.581 ms ±   2.692 ms  ┊ GC (mean ± σ):  0.00% ± 0.00%

                ▄ ▂ ▄█▂
  ▄▁▁▄▄▁▁▆▄▁▁▁▆▆█▄█▆███▄▄▁█▄▁▁▁▁▁▁▁▁▁▁▁▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄ ▁
  105 ms           Histogram: frequency by time          123 ms <

 Memory estimate: 385.28 KiB, allocs estimate: 15.

(if anything in the code is not clear please let me know and I can explain)


EDIT

If you have a small union of unique eltypes, do:

helper2.(Ref(collect(Union{unique(typeof.(eachcol(df)))...}, eachcol(df))), 1:nrow(df))

If Union{unique(typeof.(eachcol(df)))...} is not a small union, then another solution would be better, so please comment if this is good enough for you.

Solution 2:

Structure matters. Here is a matrix approach in R, using the matrixStats package (source), which ships optimized matrix functions implemented in C.

x = sample.int(3, size = 303*3e6, replace = T)            
m = matrix(x, ncol = 303, byrow = T)
bench::mark(
  var = matrixStats::rowVars(m, na.rm = T) == 0
)

On my (certainly not high-performance) machine this takes roughly 3.5 seconds for a 3-million-row matrix.
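Note that the sample above contains no missing values. With NAs present, matrixStats::rowVars(..., na.rm = TRUE) returns NA for any row with fewer than two non-missing values (including all-NA rows), so those rows have to be mapped back to TRUE separately. A minimal sketch, assuming your data starts as a data frame df of numeric columns (df is a placeholder name here, not an object from the benchmark above):

m <- as.matrix(df)                         # matrixStats works on plain matrices
v <- matrixStats::rowVars(m, na.rm = TRUE) # per-row variance, ignoring NAs
all_same <- v == 0 | is.na(v)              # zero variance, or too few non-NA values to differ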

Solution 3:

Do you have about 350MB of RAM available? If so, you can try this function:

rowequal <- function(x) {
  undetermined <- function(x, can_del) {
    if (length(can_del) < 1L)
      return(x)
    x[-can_del]
  }
  if (ncol(x) < 1L)
    return(logical())
  out <- logical(nrow(x))
  if (ncol(x) < 2L)
    return(!out)
  x1 <- x[[1L]]
  need_compare <- undetermined(seq_len(nrow(x)), which(x1 != x[[2L]]))
  x1[nas] <- x[[2L]][nas <- which(is.na(x1))]
  if (ncol(x) > 2L) {
    for (j in 3:ncol(x)) {
      need_compare <- undetermined(
        need_compare, which(x1[need_compare] != x[[j]][need_compare])
      )
      x1[nas] <- x[[j]][nas <- which(is.na(x1))]
      if (length(need_compare) < 1L)
        return(out)
    }
  }
  `[<-`(out, need_compare, TRUE)
}
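As a quick sanity check on a small made-up data frame (my own example, not part of the benchmark), the expected behaviour is that rows which are constant apart from NAs, or entirely NA, come back as TRUE:

> toy <- data.frame(a = c(1, 1, NA), b = c(1, 2, NA), c = c(NA, 1, NA))
> rowequal(toy)
[1]  TRUE FALSE  TRUE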

Below is the benchmark:

> dim(d)
[1] 3000000     300
> bench::mark(f(d), rowequal(d), iterations = 10)
# A tibble: 2 x 13
  expression       min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory time  gc   
  <bch:expr>  <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list> <lis> <lis>
1 f(d)           2.97s    2.98s     0.335    34.4MB        0    10     0      29.8s <lgl ~ <Rpro~ <ben~ <tib~
2 rowequal(d)  88.52ms  93.34ms    10.7     352.2MB        0    10     0    932.5ms <lgl ~ <Rpro~ <ben~ <tib~

where f (which I got from this post) and d are defined as:

f <- function(x) {
  v1 = do.call(pmin, c(x, na.rm = TRUE))
  v2 = do.call(pmax, c(x, na.rm = TRUE))
  v1 == v2
}

mkDT <- function(rows, cols, type = as.integer) {
  data.table::setDT(
    replicate(cols, sample(type(c(1:10, NA)), rows, replace = TRUE), simplify = FALSE)
  )
}

d <- mkDT(3e6, 300)
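For intuition, f relies on the fact that a row is constant (ignoring NAs) exactly when its row-wise minimum equals its row-wise maximum, which pmin/pmax compute across the columns with na.rm = TRUE. One caveat (my own observation, not from the benchmark): for a row that is entirely NA, pmin and pmax both return NA, so f yields NA rather than TRUE for such rows, while rowequal returns TRUE. On the toy data frame from above:

> f(toy)
[1]  TRUE FALSE    NA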

Below is an Rcpp version of the code. It achieves its best performance (in terms of both memory usage and speed) if you can ensure that all the columns in your data frame are numeric. I guess this is the most efficient solution to your problem (in R).

rowequalcpp <- function(x) {
  if (ncol(x) < 1L)
    return(logical())
  out <- logical(nrow(x))
  if (ncol(x) < 2L)
    return(!out)
  mark_equal(out, x)
  out
}

Rcpp::sourceCpp(code = '
#include <Rcpp.h>

// [[Rcpp::export]]
void mark_equal(Rcpp::LogicalVector& res, const Rcpp::DataFrame& df) {
  Rcpp::NumericVector x1 = df[0];
  int n = df.nrows();
  int need_compare[n];
  for (int i = 0; i < n; ++i)
    need_compare[i] = i;
  for (int j = 1; j < df.length(); ++j) {
    Rcpp::NumericVector dfj = df[j];
    for (int i = 0; i < n; ) {
      int itmp = need_compare[i];
      if (Rcpp::NumericVector::is_na(x1[itmp]))
        x1[itmp] = dfj[itmp];
      if (!Rcpp::NumericVector::is_na(dfj[itmp]) && x1[itmp] != dfj[itmp]) {
        need_compare[i] = need_compare[n - 1];
        need_compare[n-- - 1] = itmp;
      } else
        ++i;
    }
    if (n < 1)
      return;
  }
  for (int i = 0; i < n; ++i)
    res[need_compare[i]] = TRUE;
}
')

Benchmark (the type of your columns is crucial for performance):

> d <- mkDT(3000000, 300, as.integer)
> bench::mark(rowequal(d), rowequalcpp(d), iterations = 10)
# A tibble: 2 x 13
  expression        min  median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory time  gc   
  <bch:expr>     <bch:> <bch:t>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list> <lis> <lis>
1 rowequal(d)     100ms   147ms      7.07     398MB     3.03     7     3      991ms <lgl ~ <Rpro~ <ben~ <tib~
2 rowequalcpp(d)  101ms   102ms      9.35     309MB     2.34     8     2      855ms <lgl ~ <Rpro~ <ben~ <tib~
> d <- mkDT(3000000, 300, as.numeric)
> bench::mark(rowequal(d), rowequalcpp(d), iterations = 10)
# A tibble: 2 x 13
  expression          min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result  memory   time 
  <bch:expr>     <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>  <list>   <lis>
1 rowequal(d)     103.7ms  110.8ms      8.05   349.3MB    0.895     9     1      1.12s <lgl [~ <Rprofm~ <ben~
2 rowequalcpp(d)   26.3ms   27.3ms     36.3     11.4MB    0        10     0    275.2ms <lgl [~ <Rprofm~ <ben~
# ... with 1 more variable: gc <list>