Why is MongoDB so slow on some queries even when the winning plan is an IXSCAN?

I have a database with about 30M records in a collection, and the collection is about 100 GB in size (documents plus indexes).



I have a compound index that filters data based on user_id and some other fields such as is_active, is_logged_in, etc.
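For reference, the compound index that ends up winning in the explain output below looks roughly like this (reconstructed from its keyPattern and indexName; the build options are an assumption):

db.call_history.createIndex(
    {
        "user_id": 1,
        "trk.0.direction": 1,
        "is_read": 1,
        "trk.0.data.status": 1,
        "is_removed": 1
    },
    // name taken from the explain output; background build is an assumption
    { name: "user_id_direction_is_read_status_is_removed", background: true }
)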



With MongoDB Compass I see slow queries of about 10, 20, or even 40 seconds! When I run the exact same query myself, the result comes back in less than 500 ms (though it may be cached on the second try).



When I look at the stats of that ongoing operation, I see the following lock status:



 "lockStats": {
"Global": {
"acquireCount": {
"r": 574
}
},
"MMAPV1Journal": {
"acquireCount": {
"r": 295
},
"acquireWaitCount": {
"r": 2
},
"timeAcquiringMicros": {
"r": 15494
}
},
}
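(Op stats like the above can be pulled with db.currentOp(); a minimal sketch, assuming a filter on the collection and on ops running longer than a few seconds:)

// Sketch only; the secs_running threshold is an illustrative assumption.
db.currentOp({
    "ns": "cuda.call_history",
    "secs_running": { "$gte": 5 }
}).inprog.forEach(function (op) {
    printjson({ opid: op.opid, secs: op.secs_running, lockStats: op.lockStats });
});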


acquireCount (the number of times the operation acquired the lock in the specified mode) is very high compared to a fast query (on another collection), which has the status below:



"lockStats": {
"Global": {
"acquireCount": {
"r": 2
}
},
"MMAPV1Journal": {
"acquireCount": {
"r": 1
}
},
"Database": {
"acquireCount": {
"r": 1
}
},
"Collection": {
"acquireCount": {
"R": 1
}
}
}


When operations are slow, a query for a user that has many records takes so long that, after a few seconds, it has a domino effect on all the other operations.



When I explain the query on the large collection, I can see that it used an index:



{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "cuda.call_history",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"$or" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$eq" : false
}
},
{
"$nor" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$exists" : true
}
}
]
}
]
},
{
"is_removed" : {
"$eq" : false
}
},
{
"user_id" : {
"$eq" : "00000000000040008000000000002a5d"
}
},
{
"trk.0.direction" : {
"$eq" : "ingress"
}
},
{
"trk.0.type" : {
"$eq" : "fax"
}
},
{
"date" : {
"$lt" : "2018-01-09 10:36:31"
}
},
{
"date" : {
"$gt" : "1970-01-01 00:00:00"
}
},
{
"trk.0.data.status" : {
"$in" : [
"p_received",
"success"
]
}
}
]
},
"winningPlan" : {
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"$or" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$eq" : false
}
},
{
"$nor" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$exists" : true
}
}
]
}
]
},
{
"trk.0.type" : {
"$eq" : "fax"
}
},
{
"date" : {
"$lt" : "2018-01-09 10:36:31"
}
},
{
"date" : {
"$gt" : "1970-01-01 00:00:00"
}
}
]
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"user_id" : 1,
"trk.0.direction" : 1,
"is_read" : 1,
"trk.0.data.status" : 1,
"is_removed" : 1
},
"indexName" : "user_id_direction_is_read_status_is_removed",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"user_id" : [
"["00000000000040008000000000002a5d", "00000000000040008000000000002a5d"]"
],
"trk.0.direction" : [
"["ingress", "ingress"]"
],
"is_read" : [
"[MinKey, MaxKey]"
],
"trk.0.data.status" : [
"["p_received", "p_received"]",
"["success", "success"]"
],
"is_removed" : [
"[false, false]"
]
}
}
},
"rejectedPlans" : [
{
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"$or" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$eq" : false
}
},
{
"$nor" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$exists" : true
}
}
]
}
]
},
{
"is_removed" : {
"$eq" : false
}
},
{
"trk.0.direction" : {
"$eq" : "ingress"
}
},
{
"trk.0.type" : {
"$eq" : "fax"
}
},
{
"trk.0.data.status" : {
"$in" : [
"p_received",
"success"
]
}
}
]
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"user_id" : 1,
"date" : -1
},
"indexName" : "user_id_date",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"user_id" : [
"["00000000000040008000000000002a5d", "00000000000040008000000000002a5d"]"
],
"date" : [
"("2018-01-09 10:36:31", "1970-01-01 00:00:00")"
]
}
}
},
{
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"$or" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$eq" : false
}
},
{
"$nor" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$exists" : true
}
}
]
}
]
},
{
"is_removed" : {
"$eq" : false
}
},
{
"trk.0.direction" : {
"$eq" : "ingress"
}
},
{
"trk.0.type" : {
"$eq" : "fax"
}
},
{
"date" : {
"$lt" : "2018-01-09 10:36:31"
}
},
{
"date" : {
"$gt" : "1970-01-01 00:00:00"
}
},
{
"trk.0.data.status" : {
"$in" : [
"p_received",
"success"
]
}
}
]
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"user_id" : 1,
"to" : 1,
"from" : 1
},
"indexName" : "user_id_to_from",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"user_id" : [
"["00000000000040008000000000002a5d", "00000000000040008000000000002a5d"]"
],
"to" : [
"[MinKey, MaxKey]"
],
"from" : [
"[MinKey, MaxKey]"
]
}
}
}
]
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 4682,
"executionTimeMillis" : 2072,
"totalKeysExamined" : 4688,
"totalDocsExamined" : 4682,
"executionStages" : {
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"$or" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$eq" : false
}
},
{
"$nor" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$exists" : true
}
}
]
}
]
},
{
"trk.0.type" : {
"$eq" : "fax"
}
},
{
"date" : {
"$lt" : "2018-01-09 10:36:31"
}
},
{
"date" : {
"$gt" : "1970-01-01 00:00:00"
}
}
]
},
"nReturned" : 4682,
"executionTimeMillisEstimate" : 710,
"works" : 4897,
"advanced" : 4682,
"needTime" : 5,
"needYield" : 209,
"saveState" : 234,
"restoreState" : 234,
"isEOF" : 1,
"invalidates" : 1,
"docsExamined" : 4682,
"alreadyHasObj" : 0,
"inputStage" : {
"stage" : "IXSCAN",
"nReturned" : 4682,
"executionTimeMillisEstimate" : 305,
"works" : 4688,
"advanced" : 4682,
"needTime" : 5,
"needYield" : 0,
"saveState" : 234,
"restoreState" : 234,
"isEOF" : 1,
"invalidates" : 1,
"keyPattern" : {
"user_id" : 1,
"trk.0.direction" : 1,
"is_read" : 1,
"trk.0.data.status" : 1,
"is_removed" : 1
},
"indexName" : "user_id_direction_is_read_status_is_removed",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"user_id" : [
"["00000000000040008000000000002a5d", "00000000000040008000000000002a5d"]"
],
"trk.0.direction" : [
"["ingress", "ingress"]"
],
"is_read" : [
"[MinKey, MaxKey]"
],
"trk.0.data.status" : [
"["p_received", "p_received"]",
"["success", "success"]"
],
"is_removed" : [
"[false, false]"
]
},
"keysExamined" : 4688,
"seeks" : 6,
"dupsTested" : 0,
"dupsDropped" : 0,
"seenInvalidated" : 0
}
}
},
"serverInfo" : {
"host" : “hs1.mydomain.com”,
"port" : 27017,
"version" : "3.4.10",
"gitVersion" : "078f28920cb24de0dd479b5ea6c66c644f6326e9"
},
"ok" : 1.0
}
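A plan like the above is produced by an explain("executionStats") call along these lines; the filter below is reconstructed from the parsedQuery section and is a sketch, not the exact application code:

// Reconstructed from parsedQuery above; the $or/$nor on is_spam is
// written here with $exists: false, which is equivalent.
db.call_history.find({
    "user_id": "00000000000040008000000000002a5d",
    "trk.0.direction": "ingress",
    "trk.0.type": "fax",
    "trk.0.data.status": { "$in": ["p_received", "success"] },
    "is_removed": false,
    "$or": [
        { "trk.0.extra_data.spam.is_spam": false },
        { "trk.0.extra_data.spam.is_spam": { "$exists": false } }
    ],
    "date": { "$gt": "1970-01-01 00:00:00", "$lt": "2018-01-09 10:36:31" }
}).explain("executionStats")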


keysExamined is just 4,688! That is not much compared to the collection's 30M documents. When Mongo slows down and the domino effect kicks in, CPU and memory usage are not high; Mongo uses only about 40% of the memory. The disk partition is ext4, if that helps.
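(To put the memory numbers in context, this is roughly how the mongod-side figures can be checked; a sketch using db.serverStatus(), where the mapped field is specific to MMAPv1:)

// Resident/mapped memory and cumulative page faults as reported by mongod.
var s = db.serverStatus();
printjson({
    resident_MB: s.mem.resident,
    virtual_MB: s.mem.virtual,
    mapped_MB: s.mem.mapped,              // MMAPv1 only
    page_faults: s.extra_info.page_faults // Linux only
});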



Another example of a very slow query, in full detail:



{
"desc": "conn199276",
"threadId": "140070259820288",
"connectionId": 199276,
"client": "client_server_ip:45590",
"active": "true",
"opid": 63869351,
"secs_running": 36,
"microsecs_running": 36136211,
"op": "query",
"ns": "cuda.call_history",
"query": {
"find": "call_history",
"filter": {
"is_removed": false,
"trk.0.extra_data.spam.is_spam": true,
"trk.0.direction": "ingress",
"date": {
"$gt": "1970-01-01 00:00:00",
"$lt": "4001-01-01 00:00:00"
},
"trk.0.extra_data.status": {
"$in": [
"success",
"p_received"
]
},
"trk.0.type": "clk",
"owner_id": "00000000000040008000000000003828"
},
"sort": {
"date": -1
},
"limit": 31
},
"numYields": 6600,
"locks": {},
"waitingForLock": "false",
"lockStats": {
"Global": {
"acquireCount": {
"r": 13200
}
},
"MMAPV1Journal": {
"acquireCount": {
"r": 6611
},
"acquireWaitCount": {
"r": 9
},
"timeAcquiringMicros": {
"r": 50854
}
},
"Database": {
"acquireCount": {
"r": 6600
}
},
"Collection": {
"acquireCount": {
"R": 6600
},
"acquireWaitCount": {
"R": 11
},
"timeAcquiringMicros": {
"R": 163707
}
}
}
}


Output of db.stats():



rs0:PRIMARY> db.stats()
{
"db" : "cuda",
"collections" : 5,
"views" : 0,
"objects" : 55009248,
"avgObjSize" : 2018.6135346551184,
"dataSize" : 111042412544,
"storageSize" : 113055362336,
"numExtents" : 100,
"indexes" : 7,
"indexSize" : 14223460160,
"fileSize" : 133012914176,
"nsSizeMB" : 16,
"extentFreeList" : {
"num" : 0,
"totalSize" : 0
},
"dataFileVersion" : {
"major" : 4,
"minor" : 22
},
"ok" : 1
}


mongostat shows the output below; I think the fault count is high:



insert query update delete getmore command flushes mapped vsize   res faults qrw arw net_in net_out conn set repl                time
5 93 4 *0 0 64|0 0 282G 9.11G 26 0|0 0|0 64.3k 187k 481 rs0 PRI Jan 10 06:25:14.476
*0 107 *0 1 0 58|0 0 282G 9.14G 4 0|0 0|0 51.5k 247k 481 rs0 PRI Jan 10 06:25:15.475
2 88 5 *0 0 70|0 0 282G 9.04G 26 0|0 0|0 61.5k 245k 481 rs0 PRI Jan 10 06:25:16.476
3 98 2 *0 0 71|0 0 282G 9.12G 6 0|0 0|0 59.6k 274k 481 rs0 PRI Jan 10 06:25:17.474
1 105 *0 1 0 82|0 0 282G 9.10G 14 0|0 0|0 63.4k 254k 481 rs0 PRI Jan 10 06:25:18.476
1 88 2 *0 0 68|0 0 282G 9.12G 75 0|0 0|0 49.9k 194k 481 rs0 PRI Jan 10 06:25:19.476
5 120 4 *0 0 78|0 0 282G 9.11G 5 0|0 0|0 75.7k 548k 484 rs0 PRI Jan 10 06:25:20.476
3 89 *0 4 0 64|0 0 282G 9.14G 19 0|0 0|0 55.3k 1.59m 483 rs0 PRI Jan 10 06:25:21.476
*0 81 *0 *0 0 62|0 0 282G 9.17G 0 0|0 0|0 46.9k 265k 481 rs0 PRI Jan 10 06:25:22.476
6 105 3 *0 0 67|0 0 282G 9.15G 49 0|0 0|0 61.8k 240k 481 rs0 PRI Jan 10 06:25:23.475


And under load we see high page-fault numbers:



insert query update delete getmore command flushes mapped vsize   res faults  qrw  arw net_in net_out conn set repl                time
1 96 *0 *0 0 70|0 0 282G 10.9G 597 0|0 2|0 59.1k 1.06m 484 rs0 PRI Jan 10 07:48:45.061
1 103 3 *0 0 79|0 0 282G 10.9G 715 0|0 0|0 66.4k 176k 484 rs0 PRI Jan 10 07:48:46.060
2 102 *0 *0 0 72|0 0 282G 10.9G 718 0|0 2|0 59.6k 154k 484 rs0 PRI Jan 10 07:48:47.060
2 104 2 1 0 83|0 0 282G 11.0G 692 0|0 3|0 68.7k 154k 483 rs0 PRI Jan 10 07:48:48.061
2 103 2 *0 0 86|0 0 282G 11.0G 743 0|0 1|0 64.2k 154k 482 rs0 PRI Jan 10 07:48:49.059
*0 116 2 *0 0 95|0 0 282G 11.0G 715 0|0 2|0 68.5k 237k 484 rs0 PRI Jan 10 07:48:50.060
1 112 6 1 0 78|0 0 282G 11.0G 595 0|0 2|0 74.3k 231k 481 rs0 PRI Jan 10 07:48:51.061
2 137 5 1 0 83|0 0 282G 11.0G 733 0|0 3|0 81.6k 301k 480 rs0 PRI Jan 10 07:48:52.060
3 157 5 1 0 92|0 0 282G 11.0G 697 0|0 3|0 89.5k 183k 481 rs0 PRI Jan 10 07:48:53.059
4 102 3 *0 0 73|0 0 282G 10.9G 599 0|0 1|0 67.1k 154k 480 rs0 PRI Jan 10 07:48:54.060


We have about 15GB of free memory that MongoDB does not use!



At peak we reach about 400 ops and 500 connections per second. Is there anything I can do to improve performance?





EDIT



iostat output for the disk:



Device:         rrqm/s   wrqm/s     r/s     w/s    rkB/s    wkB/s avgrq-sz avgqu-sz   await r_await w_await  svctm  %util
sda 0.00 87.00 7.00 392.00 460.00 2544.00 15.06 0.08 0.22 8.00 0.08 0.12 4.80
dm-0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-2 0.00 0.00 7.00 478.00 460.00 2536.00 12.35 0.09 0.19 8.00 0.08 0.10 4.80

Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
sda 0.00 0.00 84.00 6.00 3620.00 56.00 81.69 0.82 8.84 9.48 0.00 4.13 37.20
dm-0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-2 0.00 0.00 85.00 6.00 3692.00 56.00 82.37 0.82 8.75 9.36 0.00 4.09 37.20

Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
sda 0.00 0.00 57.00 3.00 2804.00 52.00 95.20 0.44 7.73 7.79 6.67 4.80 28.80
dm-0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-2 0.00 0.00 56.00 3.00 2732.00 52.00 94.37 0.44 7.80 7.86 6.67 4.88 28.80


Another snapshot:



Device:         rrqm/s   wrqm/s     r/s     w/s    rkB/s    wkB/s avgrq-sz avgqu-sz   await r_await w_await  svctm  %util
sda 0.00 11.00 267.67 5.00 20813.33 97.33 153.38 1.28 4.68 4.70 3.73 3.56 97.07
dm-0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-2 0.00 0.00 267.67 16.00 20806.67 97.33 147.38 1.28 4.50 4.70 1.17 3.42 97.07

Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
sda 0.00 21.00 221.67 3.00 19089.33 110.67 170.92 1.18 5.26 5.16 12.44 3.95 88.80
dm-0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-2 0.00 0.00 221.33 24.00 19061.33 110.67 156.29 1.18 4.82 5.17 1.56 3.62 88.80




EDIT 2:



The output of iostat 1, to strip the extended columns and show just the read/write and tps figures:



https://pastebin.com/32WuRSPF



Another run in which tps reached around 1000:



https://pastebin.com/mrQdKZZF










index query-performance mongodb






  • Looks like everything slows down when mongod goes to the disk. How many IOPS can your disks handle? Especially read IOPS and latency.

    – JJussi
    Jan 12 '18 at 5:43

  • @JJussi Please see the edit part of the question; I have added the iostat output.

    – ALH
    Jan 13 '18 at 8:59

  • @JJussi Thank you for your time. How can I see that latency? Is r_await what we're looking for?

    – ALH
    Jan 13 '18 at 10:25

  • ?_await is the average wait time in ms. You could run iostat without the -x parameter and watch the tps value to see IOPS: thattommyhall.com/2011/02/18/iops-linux-iostat. The strange thing with MongoDB is that latency plays a big role and it is hard to measure, especially random read latency. How the storage is built also plays a big role; e.g. LVM on top of RAID is fast, while RAID on top of LVM is slow, at least when both are software based.

    – JJussi
    Jan 13 '18 at 12:51

  • No, the disk is not the bottleneck for direct reads. I have to raise my hands... However, that memory consumption is still a bit of a mystery; mongod should use all the free memory.

    – JJussi
    Jan 15 '18 at 4:13














1 Answer

High disk readahead could be the reason why physical memory is underutilized. If it is high, then for random reads the system may be reading a lot of data that MongoDB does not need, "polluting" the file system cache. That in turn can result in extra page faults.



I suggest making sure your system complies with the recommended configuration in the MongoDB Production Notes.






answered Jan 16 '18 at 1:07 by VaudevilleVillain
  • How can I set readahead for MongoDB? Is there a way to set this for MongoDB with MMAPv1?
    – ALH, Feb 24 '18 at 8:25

  • RA on my disk server is set to 256; I checked it with the blockdev --report command. Is that a high number? Should I lower the readahead (RA) value?
    – ALH, Feb 24 '18 at 8:30
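
Readahead is a block-device/OS setting rather than a MongoDB configuration option, so it is inspected and changed with blockdev. A minimal sketch, assuming /dev/dm-2 is the data volume seen in the iostat output above; the 32-sector value is purely illustrative, not a recommendation taken from the answer:

# Show the current readahead (RA column, in 512-byte sectors) for all block devices:
blockdev --report

# Read the readahead of a single device:
blockdev --getra /dev/dm-2

# Lower it, e.g. to 32 sectors (16 KB), then re-benchmark the slow queries:
blockdev --setra 32 /dev/dm-2

# Note: --setra does not persist across reboots; persist it via a udev rule or an init/startup script.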










