MongoDB 聚合数据缓慢,添加索引也无效,这是为什么?

MongoDB 查询十分缓慢

小弟不才,mongodb 优化时间紧任务重,因此向各位大佬提问。
目前 mongodb 中有 130w 条数据,现在要统计当月 appName 的数量。然而 130w 条数据就需要 5s 以上的查询。

相关代码

查询的脚本如下:

db.UserBehaviorOnApp.aggregate([
    // 匹配日期
   {
       $match: {
           "accessTime" : { 
               "$gte" : ISODate("2022-12-06T06:04:53Z"), 
               "$lt" : ISODate("2023-01-06T06:04:53Z")
           }
       }
   },
   // 根据 appName 进行分组,并且统计 appName 出现的次数。 $sum: 1 表示累计+1
   { $group: { _id: "$appName",  count: { $sum: 1 } } },
   // 根据 count 进行倒序排列
   { $sort: {count:-1} },
   // 取前五条记录
   { $limit : 5 }
])

使用 explain("executionStats") 分析结果如下:

{
    "explainVersion" : "1",
    "stages" : [
        {
            "$cursor" : {
                "queryPlanner" : {
                    "namespace" : "user_behavior_log.UserBehaviorOnApp",
                    "indexFilterSet" : false,
                    "parsedQuery" : {
                        "$and" : [
                            {
                                "accessTime" : {
                                    "$lt" : ISODate("2023-01-06T14:04:53.000+08:00")
                                }
                            },
                            {
                                "accessTime" : {
                                    "$gte" : ISODate("2022-12-06T14:04:53.000+08:00")
                                }
                            }
                        ]
                    },
                    "queryHash" : "BDCC37AF",
                    "planCacheKey" : "F85B6289",
                    "maxIndexedOrSolutionsReached" : false,
                    "maxIndexedAndSolutionsReached" : false,
                    "maxScansToExplodeReached" : false,
                    "winningPlan" : {
                        "stage" : "PROJECTION_SIMPLE",
                        "transformBy" : {
                            "appName" : 1,
                            "_id" : 0
                        },
                        "inputStage" : {
                            "stage" : "COLLSCAN",
                            "filter" : {
                                "$and" : [
                                    {
                                        "accessTime" : {
                                            "$lt" : ISODate("2023-01-06T14:04:53.000+08:00")
                                        }
                                    },
                                    {
                                        "accessTime" : {
                                            "$gte" : ISODate("2022-12-06T14:04:53.000+08:00")
                                        }
                                    }
                                ]
                            },
                            "direction" : "forward"
                        }
                    },
                    "rejectedPlans" : [ ]
                },
                "executionStats" : {
                    "executionSuccess" : true,
                    "nReturned" : 1333199,
                    "executionTimeMillis" : 5769,
                    "totalKeysExamined" : 0,
                    "totalDocsExamined" : 1333343,
                    "executionStages" : {
                        "stage" : "PROJECTION_SIMPLE",
                        "nReturned" : 1333199,
                        "executionTimeMillisEstimate" : 344,
                        "works" : 1333345,
                        "advanced" : 1333199,
                        "needTime" : 145,
                        "needYield" : 0,
                        "saveState" : 1379,
                        "restoreState" : 1379,
                        "isEOF" : 1,
                        "transformBy" : {
                            "appName" : 1,
                            "_id" : 0
                        },
                        "inputStage" : {
                            "stage" : "COLLSCAN",
                            "filter" : {
                                "$and" : [
                                    {
                                        "accessTime" : {
                                            "$lt" : ISODate("2023-01-06T14:04:53.000+08:00")
                                        }
                                    },
                                    {
                                        "accessTime" : {
                                            "$gte" : ISODate("2022-12-06T14:04:53.000+08:00")
                                        }
                                    }
                                ]
                            },
                            "nReturned" : 1333199,
                            "executionTimeMillisEstimate" : 250,
                            "works" : 1333345,
                            "advanced" : 1333199,
                            "needTime" : 145,
                            "needYield" : 0,
                            "saveState" : 1379,
                            "restoreState" : 1379,
                            "isEOF" : 1,
                            "direction" : "forward",
                            "docsExamined" : 1333343
                        }
                    }
                }
            },
            "nReturned" : 1333199,
            "executionTimeMillisEstimate" : 2222
        },
        {
            "$group" : {
                "_id" : "$appName",
                "count" : {
                    "$sum" : {
                        "$const" : 1
                    }
                }
            },
            "maxAccumulatorMemoryUsageBytes" : {
                "count" : 74767392
            },
            "totalOutputDataSizeBytes" : 237801844,
            "usedDisk" : false,
            "nReturned" : 1038436,
            "executionTimeMillisEstimate" : 5632
        },
        {
            "$sort" : {
                "sortKey" : {
                    "count" : -1
                },
                "limit" : 5
            },
            "totalDataSizeSortedBytesEstimate" : 12005,
            "usedDisk" : false,
            "nReturned" : 5,
            "executionTimeMillisEstimate" : 5768
        }
    ],
    "serverInfo" : {
        "host" : "02a8a2b6c8dc",
        "port" : 27017,
        "version" : "5.0.5",
        "gitVersion" : "d65fd89df3fc039b5c55933c0f71d647a54510ae"
    },
    "serverParameters" : {
        "internalQueryFacetBufferSizeBytes" : 104857600,
        "internalQueryFacetMaxOutputDocSizeBytes" : 104857600,
        "internalLookupStageIntermediateDocumentMaxSizeBytes" : 104857600,
        "internalDocumentSourceGroupMaxMemoryBytes" : 104857600,
        "internalQueryMaxBlockingSortMemoryUsageBytes" : 104857600,
        "internalQueryProhibitBlockingMergeOnMongoS" : 0,
        "internalQueryMaxAddToSetBytes" : 104857600,
        "internalDocumentSourceSetWindowFieldsMaxMemoryBytes" : 104857600
    },
    "command" : {
        "aggregate" : "UserBehaviorOnApp",
        "pipeline" : [
            {
                "$match" : {
                    "accessTime" : {
                        "$gte" : ISODate("2022-12-06T14:04:53.000+08:00"),
                        "$lt" : ISODate("2023-01-06T14:04:53.000+08:00")
                    }
                }
            },
            {
                "$group" : {
                    "_id" : "$appName",
                    "count" : {
                        "$sum" : 1
                    }
                }
            },
            {
                "$sort" : {
                    "count" : -1
                }
            },
            {
                "$limit" : 5
            }
        ],
        "cursor" : {
            
        },
        "$db" : "user_behavior_log"
    },
    "ok" : 1,
    "$clusterTime" : {
        "clusterTime" : Timestamp(1672991848, 1),
        "signature" : {
            "hash" : BinData(0,"AAAAAAAAAAAAAAAAAAAAAAAAAAA="),
            "keyId" : 0
        }
    },
    "operationTime" : Timestamp(1672991848, 1)
}

image.png

我的思路

我尝试着给 accessTime 添加索引

db.UserBehaviorOnApp.createIndex({"accessTime": 1})

但是查询时间反而更久了,虽然用上了索引。
image.png
结果

{
    "explainVersion" : "1",
    "stages" : [
        {
            "$cursor" : {
                "queryPlanner" : {
                    "namespace" : "user_behavior_log.UserBehaviorOnApp",
                    "indexFilterSet" : false,
                    "parsedQuery" : {
                        "$and" : [
                            {
                                "accessTime" : {
                                    "$lt" : ISODate("2023-01-06T14:04:53.000+08:00")
                                }
                            },
                            {
                                "accessTime" : {
                                    "$gte" : ISODate("2022-12-06T14:04:53.000+08:00")
                                }
                            }
                        ]
                    },
                    "queryHash" : "BDCC37AF",
                    "planCacheKey" : "D418D338",
                    "maxIndexedOrSolutionsReached" : false,
                    "maxIndexedAndSolutionsReached" : false,
                    "maxScansToExplodeReached" : false,
                    "winningPlan" : {
                        "stage" : "PROJECTION_SIMPLE",
                        "transformBy" : {
                            "appName" : 1,
                            "_id" : 0
                        },
                        "inputStage" : {
                            "stage" : "FETCH",
                            "inputStage" : {
                                "stage" : "IXSCAN",
                                "keyPattern" : {
                                    "accessTime" : 1
                                },
                                "indexName" : "accessTime_1",
                                "isMultiKey" : false,
                                "multiKeyPaths" : {
                                    "accessTime" : [ ]
                                },
                                "isUnique" : false,
                                "isSparse" : false,
                                "isPartial" : false,
                                "indexVersion" : 2,
                                "direction" : "forward",
                                "indexBounds" : {
                                    "accessTime" : [ "[new Date(1670306693000), new Date(1672985093000))" ]
                                }
                            }
                        }
                    },
                    "rejectedPlans" : [ ]
                },
                "executionStats" : {
                    "executionSuccess" : true,
                    "nReturned" : 1333199,
                    "executionTimeMillis" : 10107,
                    "totalKeysExamined" : 1333199,
                    "totalDocsExamined" : 1333199,
                    "executionStages" : {
                        "stage" : "PROJECTION_SIMPLE",
                        "nReturned" : 1333199,
                        "executionTimeMillisEstimate" : 2092,
                        "works" : 1333200,
                        "advanced" : 1333199,
                        "needTime" : 0,
                        "needYield" : 0,
                        "saveState" : 1379,
                        "restoreState" : 1379,
                        "isEOF" : 1,
                        "transformBy" : {
                            "appName" : 1,
                            "_id" : 0
                        },
                        "inputStage" : {
                            "stage" : "FETCH",
                            "nReturned" : 1333199,
                            "executionTimeMillisEstimate" : 1986,
                            "works" : 1333200,
                            "advanced" : 1333199,
                            "needTime" : 0,
                            "needYield" : 0,
                            "saveState" : 1379,
                            "restoreState" : 1379,
                            "isEOF" : 1,
                            "docsExamined" : 1333199,
                            "alreadyHasObj" : 0,
                            "inputStage" : {
                                "stage" : "IXSCAN",
                                "nReturned" : 1333199,
                                "executionTimeMillisEstimate" : 324,
                                "works" : 1333200,
                                "advanced" : 1333199,
                                "needTime" : 0,
                                "needYield" : 0,
                                "saveState" : 1379,
                                "restoreState" : 1379,
                                "isEOF" : 1,
                                "keyPattern" : {
                                    "accessTime" : 1
                                },
                                "indexName" : "accessTime_1",
                                "isMultiKey" : false,
                                "multiKeyPaths" : {
                                    "accessTime" : [ ]
                                },
                                "isUnique" : false,
                                "isSparse" : false,
                                "isPartial" : false,
                                "indexVersion" : 2,
                                "direction" : "forward",
                                "indexBounds" : {
                                    "accessTime" : [ "[new Date(1670306693000), new Date(1672985093000))" ]
                                },
                                "keysExamined" : 1333199,
                                "seeks" : 1,
                                "dupsTested" : 0,
                                "dupsDropped" : 0,
                                "indexDef" : {
                                    "indexName" : "accessTime_1",
                                    "isMultiKey" : false,
                                    "multiKeyPaths" : {
                                        "accessTime" : [ ]
                                    },
                                    "keyPattern" : {
                                        "accessTime" : 1
                                    },
                                    "isUnique" : false,
                                    "isSparse" : false,
                                    "isPartial" : false,
                                    "direction" : "forward"
                                }
                            }
                        }
                    }
                }
            },
            "nReturned" : 1333199,
            "executionTimeMillisEstimate" : 6492
        },
        {
            "$group" : {
                "_id" : "$appName",
                "count" : {
                    "$sum" : {
                        "$const" : 1
                    }
                }
            },
            "maxAccumulatorMemoryUsageBytes" : {
                "count" : 74767392
            },
            "totalOutputDataSizeBytes" : 237801844,
            "usedDisk" : false,
            "nReturned" : 1038436,
            "executionTimeMillisEstimate" : 9928
        },
        {
            "$sort" : {
                "sortKey" : {
                    "count" : -1
                },
                "limit" : 5
            },
            "totalDataSizeSortedBytesEstimate" : 11760,
            "usedDisk" : false,
            "nReturned" : 5,
            "executionTimeMillisEstimate" : 10101
        }
    ],
    "serverInfo" : {
        "host" : "02a8a2b6c8dc",
        "port" : 27017,
        "version" : "5.0.5",
        "gitVersion" : "d65fd89df3fc039b5c55933c0f71d647a54510ae"
    },
    "serverParameters" : {
        "internalQueryFacetBufferSizeBytes" : 104857600,
        "internalQueryFacetMaxOutputDocSizeBytes" : 104857600,
        "internalLookupStageIntermediateDocumentMaxSizeBytes" : 104857600,
        "internalDocumentSourceGroupMaxMemoryBytes" : 104857600,
        "internalQueryMaxBlockingSortMemoryUsageBytes" : 104857600,
        "internalQueryProhibitBlockingMergeOnMongoS" : 0,
        "internalQueryMaxAddToSetBytes" : 104857600,
        "internalDocumentSourceSetWindowFieldsMaxMemoryBytes" : 104857600
    },
    "command" : {
        "aggregate" : "UserBehaviorOnApp",
        "pipeline" : [
            {
                "$match" : {
                    "accessTime" : {
                        "$gte" : ISODate("2022-12-06T14:04:53.000+08:00"),
                        "$lt" : ISODate("2023-01-06T14:04:53.000+08:00")
                    }
                }
            },
            {
                "$group" : {
                    "_id" : "$appName",
                    "count" : {
                        "$sum" : 1
                    }
                }
            },
            {
                "$sort" : {
                    "count" : -1
                }
            },
            {
                "$limit" : 5
            }
        ],
        "cursor" : {
            
        },
        "$db" : "user_behavior_log"
    },
    "ok" : 1,
    "$clusterTime" : {
        "clusterTime" : Timestamp(1672991948, 3),
        "signature" : {
            "hash" : BinData(0,"AAAAAAAAAAAAAAAAAAAAAAAAAAA="),
            "keyId" : 0
        }
    },
    "operationTime" : Timestamp(1672991948, 3)
}

请问到底如何才能优化这条查询的性能?
文档结构如下:
image.png

回复
阅读 633
3 个回答

aggregate 是聚合,内部操作都是管道pipline,管道的含义很简单,就是筛选或者改变流向下一个阶段的数据。

match完成,还有 133万 数据,也就是流向下一个阶段就有这么多数据。

group就会读取这133万的数据进行计算,也就是读取每一个文档然后获取appName字段进行分组计算,这一步本身就超级慢。而且appName还是字符串类型,也会更慢。

sort又会进行一次133万数据排序。

这种场景本身mongo就非常不适合处理。因为他就是简单的阶段流程,不像有些sql有各种算法。

这种场景,一般来说:

  1. 避免太长的筛选条件,也就是第一步需要将数据量缩小
  2. 如果确实需求太长,那么需要后台提前计算,然后存储起来

这种实时分析还是得clickhouse,其他都不行。

又看了题主的需求,既然是数据量过大导致的查询过慢,根据需求,按照你最后给的思路。是否可以按照以下思路来拆分:
既然是当月数据的分组查询,可不可以在每天初始弄个定时任务,统计以下前一天的数据,并将结果存储到一张history表中。
则需求就可以变成:
从history表中取30条(30天)的数据进行汇总统计出appName出现前5的数据。
在根据该数据去UserBehaviorOnApp进行查询即可。可以分5次等值,或者一次in来进行查询。
这样就规避了大数据量的一次性分组耗时长的问题。

添加hint试试呢

db.UserBehaviorOnApp.aggregate([
    // 匹配日期
   {
       $match: {
           "accessTime" : { 
               "$gte" : ISODate("2022-12-06T06:04:53Z"), 
               "$lt" : ISODate("2023-01-06T06:04:53Z")
           }
       }
   },
   // 根据 appName 进行分组,并且统计 appName 出现的次数。 $sum: 1 表示累计+1
   { $group: { _id: "$appName",  count: { $sum: 1 } } },
   // 根据 count 进行倒序排列
   { $sort: {count:-1} },
   // 取前五条记录
   { $limit : 5 },
   // 使用指定的索引执行查询
   { $hint : {accessTime: 1} }
])

根据时间去 $match 数据,然后再根据一个字符串进行 $group,这样当数据量大时确实很耗时。但是如果不想换别的方案,我这里也分享一个 stackoverflow 上大佬给我的建议,虽然我执行他提供的脚本并没有搜索到结果,但是我觉得可能是我对 mongodb 管道操作理解不到位(我接触mongo没太久),这里我分享出来,也请大家帮我看看。
那位大佬的思想就是将一次性查询 30 天变为 30 次查一天的查询,然后使用 $merge 来进行合并。因为第一阶段 $match 后到 appName 还有 133w 的基数,因此在索引上做优化可能提升空间不大。他推荐我跨 UserBehaviorOnApp 聚合并将结果 $merge 到一个新的集合。脚本如下:

db.UserBehaviorOnApp.aggregate([
{ $match: { accessTime: { $gt: lastTimeIRanThis } } },
{ $group: {
  _id: {
    frequency: "hour",
    appName: "$appName",
    startTime: { $dateTrunc: { date: "$accessTime", unit: "hour" } }
  },
  count: { $sum: 1 },
}},
{ $merge: {
  into: "UserBehaviorOnAppAggregated",
  on: "_id",
  whenNotMatched: "insert",
  whenMatched: [
    { $set: { count: {$add: ["$count", "$$new.count"] } } }
  ]
}}
])
db.UserBehaviorOnAppAggregated.aggregate([
{ $match: { "_id.frequency": "hour", "_id.startTime": { $gt: lastTimeIRanThisRoundedToNearestHour } } },
{ $group: {
  _id: {
    frequency: "day",
    appName: "$_id.appName",
    startTime: { $dateTrunc: { date: "$_id.startTime", unit: "day" } }
  },
  count: { $sum: 1 },
}},
{ $merge: {
  into: "UserBehaviorOnAppAggregated",
  on: "_id",
  whenNotMatched: "insert",
  whenMatched: [
    { $set: { count: {$add: ["$count", "$$new.count"] } } }
  ]
}}
])
db.UserBehaviorOnAppAggregated.aggregate([
    { $match: {
      "_id.frequency": "day",
      "_id.startTime" : { "$gte" : ISODate("2022-12-06T00:00:00Z"), "$lt" : ISODate("2023-01-05T23:59:59Z")}
    }},
    { $group: { _id: "$_id.appName",  count: { $sum: 1 } } },
    { $sort: {count: -1} },
    { $limit : 5 },
])

具体的解释可以看:https://stackoverflow.com/que...

如果有大佬能够通过该方法得到一个有效的提升或优化,还请不吝赐教分享一下。

宣传栏