MongoDB select count(distinct x) on an indexed column - count unique results for large data sets
1) The easiest way to do this is via the aggregation framework. This takes two "$group" commands: the first one groups by distinct values, the second one counts all of the distinct values
pipeline = [
{ $group: { _id: "$myIndexedNonUniqueField"} },
{ $group: { _id: 1, count: { $sum: 1 } } }
];
//
// Run the aggregation command
//
R = db.runCommand(
{
"aggregate": "myCollection" ,
"pipeline": pipeline
}
);
printjson(R);
2) If you want to do this with Map/Reduce you can. This is also a two-phase process: in the first phase we build a new collection with a list of every distinct value for the key. In the second we do a count() on the new collection.
var SOURCE = db.myCollection;
var DEST = db.distinct
DEST.drop();
map = function() {
emit( this.myIndexedNonUniqueField , {count: 1});
}
reduce = function(key, values) {
var count = 0;
values.forEach(function(v) {
count += v['count']; // count each distinct value for lagniappe
});
return {count: count};
};
//
// run map/reduce
//
res = SOURCE.mapReduce( map, reduce,
{ out: 'distinct',
verbose: true
}
);
print( "distinct count= " + res.counts.output );
print( "distinct count=", DEST.count() );
Note that you cannot return the result of the map/reduce inline, because that will potentially overrun the 16MB document size limit. You can save the calculation in a collection and then count() the size of the collection, or you can get the number of results from the return value of mapReduce().
db.myCollection.aggregate(
{$group : {_id : "$myIndexedNonUniqueField"} },
{$group: {_id:1, count: {$sum : 1 }}});
straight to result:
db.myCollection.aggregate(
{$group : {_id : "$myIndexedNonUniqueField"} },
{$group: {_id:1, count: {$sum : 1 }}})
.result[0].count;
Following solution worked for me
db.test.distinct('user'); [ "alex", "England", "France", "Australia" ]
db.countries.distinct('country').length 4