What is the fastest or most elegant way to compute a set difference using Javascript arrays?

Let A and B be two sets. I'm looking for really fast or elegant ways to compute the set difference (A - B or A \B, depending on your preference) between them. The two sets are stored and manipulated as Javascript arrays, as the title says.

Notes:

  • Gecko-specific tricks are okay
  • I'd prefer sticking to native functions (but I am open to a lightweight library if it's way faster)
  • I've seen, but not tested, JS.Set (see previous point)

Edit: I noticed a comment about sets containing duplicate elements. When I say "set" I'm referring to the mathematical definition, which means (among other things) that they do not contain duplicate elements.


if don't know if this is most effective, but perhaps the shortest

A = [1, 2, 3, 4];
B = [1, 3, 4, 7];

diff = A.filter(function(x) { return B.indexOf(x) < 0 })

console.log(diff);

Updated to ES6:

A = [1, 2, 3, 4];
B = [1, 3, 4, 7];

diff = A.filter(x => !B.includes(x) );

console.log(diff);

Well, 7 years later, with ES6's Set object it's quite easy (but still not as compact as python's A - B), and reportedly faster than indexOf for large arrays:

console.clear();
let a = new Set([1, 2, 3, 4]);
let b = new Set([5, 4, 3, 2]);


let a_minus_b = new Set([...a].filter(x => !b.has(x)));
let b_minus_a = new Set([...b].filter(x => !a.has(x)));
let a_intersect_b = new Set([...a].filter(x => b.has(x))); 

console.log([...a_minus_b]) // {1}
console.log([...b_minus_a]) // {5}
console.log([...a_intersect_b]) // {2,3,4}

You can use an object as a map to avoid linearly scanning B for each element of A as in user187291's answer:

function setMinus(A, B) {
    var map = {}, C = [];

    for(var i = B.length; i--; )
        map[B[i].toSource()] = null; // any other value would do

    for(var i = A.length; i--; ) {
        if(!map.hasOwnProperty(A[i].toSource()))
            C.push(A[i]);
    }

    return C;
}

The non-standard toSource() method is used to get unique property names; if all elements already have unique string representations (as is the case with numbers), you can speed up the code by dropping the toSource() invocations.


Looking at a lof of these solutions, they do fine for small cases. But, when you blow them up to a million items, the time complexity starts getting silly.

 A.filter(v => B.includes(v))

That starts looking like an O(N^2) solution. Since there is an O(N) solution, let's use it, you can easily modify to not be a generator if you're not up to date on your JS runtime.

    function *setMinus(A, B) {
      const setA = new Set(A);
      const setB = new Set(B);

      for (const v of setB.values()) {
        if (!setA.delete(v)) {
            yield v;
        }
      }

      for (const v of setA.values()) {
        yield v;
      }
    }

    a = [1,2,3];
    b = [2,3,4];

    console.log(Array.from(setMinus(a, b)));

While this is a bit more complex than many of the other solutions, when you have large lists this will be far faster.

Let's take a quick look at the performance difference, running it on a set of 1,000,000 random integers between 0...10,000 we see the following performance results.

setMinus time =  181 ms
    diff time =  19099 ms

function buildList(count, range) {
  result = [];
  for (i = 0; i < count; i++) {
    result.push(Math.floor(Math.random() * range))
  }
  return result;
}

function *setMinus(A, B) {
  const setA = new Set(A);
  const setB = new Set(B);

  for (const v of setB.values()) {
    if (!setA.delete(v)) {
        yield v;
    }
  }

  for (const v of setA.values()) {
    yield v;
  }
}

function doDiff(A, B) {
  return A.filter(function(x) { return B.indexOf(x) < 0 })
}

const listA = buildList(100_000, 100_000_000); 
const listB = buildList(100_000, 100_000_000); 

let t0 = process.hrtime.bigint()

const _x = Array.from(setMinus(listA, listB))

let t1 = process.hrtime.bigint()

const _y = doDiff(listA, listB)

let t2 = process.hrtime.bigint()

console.log("setMinus time = ", (t1 - t0) / 1_000_000n, "ms");
console.log("diff time = ", (t2 - t1) / 1_000_000n, "ms");