Skipped Count 0 In Aggregate Function
Solution 1:
The group stage is producing documents based on grouping on your given _id and counting the number of documents from the previous stage that end up in the group. Hence, a count of zero would be the result of a document being created from 0 input documents belonging to the group. Thinking about it this way, it's clear that there's no way the aggregation pipeline can do this for you. It doesn't know what all of the "missing" time periods are and it can't invent the appropriate documents out of thin air. Reapplying your extra knowledge about the missing time periods to complete the picture at the end seems like a reasonable solution (not "hacky") if you need to have an explicit count of 0 for empty time periods.
Solution 2:
It has already been said that the best thing to do here is to "merge" your results in post-processing, rather than expecting "keys" that do not exist to appear, or issuing multiple queries with explicit keys (which possibly will not aggregate any results) and combining them.
What has not already been said is how you actually do this, so I'll give you a MongoDB "thinking" kind of way to collect your results.
As a quick disclaimer, you could possibly employ much the same approach by "seeding" empty keys for each interval using mapReduce, or possibly even by altering your data so that there is always an empty value within each possible block. Those approaches seem basically "hacky", and the mapReduce approach is not going to provide the best performance or multiple results.
What I would suggest is that working with collection results for the MongoDB brain can be made simple. There is a neat little solution called NeDB, which is billed as a kind of SQLite for MongoDB. It supports a subset of functionality and is therefore perfect for "in memory" manipulation of results with a MongoDB mindset:
// Dependencies: async for control flow, mongoose for MongoDB access, and
// nedb as the in-memory datastore used to merge results.
var async = require('async'),
    mongoose = require('mongoose'),
    DataStore = require('nedb'),
    Schema = mongoose.Schema;

// Minimal schema: each document only carries the timestamp that is bucketed.
var documentSchema = new Schema({
  time: { type: Date, default: Date.now }
});

var Document = mongoose.model( "Document", documentSchema );

mongoose.connect('mongodb://localhost/test');
/**
 * Count documents per time block between `start` and `end`, reporting an
 * explicit count of 0 for blocks that contain no documents.
 *
 * First seeds an in-memory datastore with one `{ _id, count: 0 }` record per
 * block, then streams the aggregation results and increments the matching
 * record (upserting any bucket key that was not seeded).
 *
 * @param {number} timeBlock - Block size, in the same units as start/end
 *   (the caller in this file passes milliseconds).
 * @param {number} start - Inclusive lower bound as a numeric timestamp.
 * @param {number} end - Exclusive upper bound as a numeric timestamp.
 * @param {function(Error, Array)} callback - Called once with
 *   `(err, records)`, where records are the `{ _id, count }` documents held
 *   in the in-memory datastore.
 */
var getCount = function(timeBlock, start, end, callback) {
  async.waterfall(
    [
      // Fill a blank series
      function(callback) {
        var db = new DataStore();
        var current = start;

        // NOTE(review): a synchronous test function matches older async
        // releases; newer releases may expect a callback-style test - confirm
        // against the installed version.
        async.whilst(
          function() { return current < end },
          function(callback) {
            // Kept in the same form as the bucket expression used in the
            // pipeline below; numerically this is `current`.
            var delta = end - current;
            db.insert({ "_id": end - delta, "count": 0 }, function(err, doc) {
              current += timeBlock;
              callback(err);
            });
          },
          function(err) {
            callback(err, db);
          }
        );
      },

      // Get data and update
      function(db, callback) {
        // Guard so that an update failure, a cursor error and the normal
        // "end" path can never invoke the waterfall callback more than once.
        var finished = false;
        var finish = function(err, result) {
          if (finished) { return; }
          finished = true;
          callback(err, result);
        };

        var cursor = Document.collection.aggregate(
          [
            // Match documents
            { "$match": {
              "time": {
                "$gte": new Date(start),
                "$lt": new Date(end)
              }
            }},

            // Group. 1 step and less hacky
            { "$group": {
              "_id": {
                "$let": {
                  "vars": {
                    // Distance from `end`, truncated to a whole number of
                    // time blocks.
                    "delta": {
                      "$subtract": [
                        { "$subtract": [ new Date(end), "$time" ] },
                        { "$mod": [
                          { "$subtract": [ new Date(end), "$time" ] },
                          timeBlock
                        ]}
                      ]
                    }
                  },
                  "in": { "$subtract": [ end, "$$delta" ] }
                }
              },
              "count": { "$sum": 1 }
            }}
          ],
          { "cursor": { "batchSize": 100 } }
        );

        cursor.on("data", function(item) {
          // Pause while the in-memory update is in flight so that updates
          // are applied one at a time.
          cursor.pause();
          console.log( "called" );
          db.update(
            { "_id": item._id },
            { "$inc": { "count": item.count } },
            { "upsert": true },
            function(err) {
              if (err) { finish(err); }
              cursor.resume();
            }
          );
        });

        // Surface stream failures instead of leaving the caller waiting.
        cursor.on("error", finish);

        cursor.on("end", function() {
          console.log( "done" );
          if (finished) { return; }
          db.find({}, function(err, result) {
            finish(err, result);
          });
        });
      }
    ],
    function(err, result) {
      callback(err, result);
    }
  );
}
// Once the connection is open, print the hourly counts for 2014-07-01.
mongoose.connection.on("open", function(err, conn) {
  getCount(
    1000 * 60 * 60,                      // each hour
    new Date("2014-07-01").valueOf(),    // start
    new Date("2014-07-02").valueOf(),    // end
    function(err, result) {
      if (err) throw err;
      console.log( result );
    }
  );
});
So essentially, create each interval as a record in an in-memory collection and then just update those interval records with the actual data retrieved. I can't think of another way to do that which would be simpler or more natural to this way of thinking.
Just a footnote, the "interval" logic is just replicated from your question, but in fact the time periods are "rounded up" where 15 minutes would appear in hour 1. It usually is the practice to round down so that everything belongs to the interval it falls in and not the next one.
Solution 3:
This is the hacky fix I did for now:
/**
 * Count documents per time block between `start` and `end`, filling in
 * `{ count: 0, time }` entries for blocks the aggregation did not return.
 *
 * NOTE(review): the callback is invoked with a single argument - the error
 * on failure, or the results array on success. That is kept as-is for
 * existing callers, but it differs from the usual `(err, result)` Node
 * convention; consider migrating callers.
 *
 * @param {number} timeBlock - Block size, in the same units as start/end.
 * @param {number} start - Inclusive lower bound as a numeric timestamp.
 * @param {number} end - Exclusive upper bound as a numeric timestamp.
 * @param {function(*)} cb - Receives either the error or an array of
 *   `{ count, time }` objects.
 */
var getCount = function(timeBlock, start, end, cb) {
  Document.aggregate(
    {
      $match: {
        time: {
          $gte: new Date(start),
          $lt: new Date(end)
        }
      }
    },
    {
      // Distance of each document from `end`.
      $project: {
        time: 1,
        delta: { $subtract: [
          new Date(end),
          '$time'
        ]}
      }
    },
    {
      // Truncate that distance to a whole number of time blocks.
      $project: {
        time: 1,
        delta: { $subtract: [
          "$delta",
          { $mod: [
            "$delta",
            timeBlock
          ]}
        ]}
      }
    },
    {
      $group: {
        _id: { $subtract: [
          end,
          "$delta"
        ]},
        count: { $sum: 1 }
      }
    },
    {
      $project: {
        time: "$_id",
        count: 1,
        _id: 0
      }
    },
    {
      $sort: {
        time: 1
      }
    },
    function(err, results) {
      if (err) {
        cb(err);
        return;
      }

      // really hacky way
      var numOfTimeBlocks = ( end - start ) / timeBlock;

      // in case there is no 0s in the given period of time there is no need
      // to iterate through all of the results
      if ( results.length === numOfTimeBlocks ) {
        cb(results);
        return;
      }

      // Index results by bucket time once, instead of scanning the array for
      // every block. The first entry wins, matching indexOf semantics.
      var byTime = Object.create(null);
      results.forEach(function(item) {
        if (!(item.time in byTime)) {
          byTime[item.time] = item;
        }
      });

      var time = start;
      var details = [];
      for ( var i = 0; i < numOfTimeBlocks; i++ ) {
        // Bucket keys are the upper edge of each block, so advance first.
        time += timeBlock;
        if (time in byTime) {
          details.push(byTime[time]);
        } else {
          details.push({ count: 0, time: time });
        }
      }
      cb(details);
    }
  );
}
I was also thinking about doing one query per time block, which gives the same result but I think is inefficient because you query the database N times.
Post a Comment for "Skipped Count 0 In Aggregate Function"