这台机器是我的 MongoDB 集群中的一个分片。集群共有三个分片:shard1 位于 machine1,shard2 位于 machine2,这两台机器均为 8G 内存、800G 磁盘;machine3 上运行 configdb、mongos 和 shard3,其配置为 16G 内存、400G 磁盘。
现在的问题是:
machine3 上的 mongostat 输出是正常的,但在 machine1 和 machine2 上,缺页错误(faults)和数据库锁定比例(locked db)一直很高。
下面只列出 machine1 的部分状态。top 命令的结果:
[]$top
Cpu(s): 0.2%us, 0.2%sy, 0.0%ni, 99.2%id, 0.3%wa, 0.0%hi, 0.0%si, 0.2%st
Mem: 7633792k total, 7302168k used, 331624k free, 84456k buffers
Swap: 0k total, 0k used, 0k free, 6209852k cached
PID PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
7562 20 0 200g 1.0g 702m S 0.3 14.3 4:36.50 mongod
这是mongostat:
insert query update delete getmore command flushes mapped vsize res non-mapped faults locked db idx miss % qr|qw ar|aw netIn netOut conn time
0 2 12 0 0 13 0 100g 201g 911m 101g 11 amazon:38.9% 0 3|0 1|0 4k 4k 14 01:45:35
0 0 3 0 0 7 1 100g 201g 912m 101g 28 amazon:1.2% 0 0|0 0|0 1k 3k 14 01:45:36
0 2 14 0 0 15 0 100g 201g 912m 101g 93 amazon:4.6% 0 0|0 0|0 7k 4k 14 01:45:37
0 0 0 0 0 1 0 100g 201g 911m 101g 141 amazon:0.2% 0 0|0 0|0 62b 2k 14 01:45:38
0 6 24 0 0 25 0 100g 201g 913m 101g 123 amazon:6.4% 0 0|0 0|0 8k 6k 14 01:45:39
0 1 9 0 0 10 0 100g 201g 912m 101g 33 amazon:4.2% 0 0|0 0|0 3k 3k 14 01:45:40
0 12 59 0 0 58 0 100g 201g 914m 101g 108 amazon:30.0% 0 1|0 0|1 24k 12k 14 01:45:41
0 20 93 0 0 96 0 100g 201g 911m 101g 114 amazon:36.1% 0 0|0 0|0 33k 17k 14 01:45:42
0 19 84 0 0 86 0 100g 201g 913m 101g 103 amazon:43.9% 0 0|0 1|0 28k 16k 14 01:45:43
0 9 29 0 0 26 0 100g 201g 914m 101g 37 amazon:5.5% 0 5|0 0|1 11k 6k 14 01:45:44
这是服务器状态:
> db.serverStatus()
{
"host" : "XX-XX-XX-XX:25018",
"version" : "2.2.3",
"process" : "mongod",
"pid" : 7562,
"uptime" : 1410,
"uptimeMillis" : NumberLong(1410211),
"uptimeEstimate" : 1390,
"localTime" : ISODate("2013-03-22T01:49:01.459Z"),
"locks" : {
"." : {
"timeLockedMicros" : {
"R" : NumberLong(563437),
"W" : NumberLong(22798453)
},
"timeAcquiringMicros" : {
"R" : NumberLong(303677814),
"W" : NumberLong(59991149)
}
},
"admin" : {
"timeLockedMicros" : {
},
"timeAcquiringMicros" : {
}
},
"local" : {
"timeLockedMicros" : {
"r" : NumberLong(6613),
"w" : NumberLong(0)
},
"timeAcquiringMicros" : {
"r" : NumberLong(1937433),
"w" : NumberLong(0)
}
},
"amazon" : {
"timeLockedMicros" : {
"r" : NumberLong(203845605),
"w" : NumberLong(651848025)
},
"timeAcquiringMicros" : {
"r" : NumberLong(621538184),
"w" : NumberLong(1525509360)
}
},
"test" : {
"timeLockedMicros" : {
"r" : NumberLong(5143),
"w" : NumberLong(999532)
},
"timeAcquiringMicros" : {
"r" : NumberLong(157712),
"w" : NumberLong(60)
}
}
},
"globalLock" : {
"totalTime" : NumberLong(1410211000),
"lockTime" : NumberLong(22798453),
"currentQueue" : {
"total" : 0,
"readers" : 0,
"writers" : 0
},
"activeClients" : {
"total" : 0,
"readers" : 0,
"writers" : 0
}
},
"mem" : {
"bits" : 64,
"resident" : 945,
"virtual" : 205577,
"supported" : true,
"mapped" : 102383,
"mappedWithJournal" : 204766
},
"connections" : {
"current" : 14,
"available" : 805
},
"extra_info" : {
"note" : "fields vary by platform",
"heap_usage_bytes" : 190782680,
"page_faults" : 68002
},
"indexCounters" : {
"btree" : {
"accesses" : 274412,
"hits" : 274412,
"misses" : 0,
"resets" : 0,
"missRatio" : 0
}
},
"backgroundFlushing" : {
"flushes" : 23,
"total_ms" : 89781,
"average_ms" : 3903.521739130435,
"last_ms" : 929,
"last_finished" : ISODate("2013-03-22T01:48:32.243Z")
},
"cursors" : {
"totalOpen" : 0,
"clientCursors_size" : 0,
"timedOut" : 0
},
"network" : {
"bytesIn" : 11325630,
"bytesOut" : 181775584,
"numRequests" : 67850
},
"opcounters" : {
"insert" : 157,
"query" : 6898,
"update" : 29954,
"delete" : 0,
"getmore" : 0,
"command" : 30902
},
"asserts" : {
"regular" : 0,
"warning" : 0,
"msg" : 0,
"user" : 1,
"rollovers" : 0
},
"writeBacksQueued" : false,
"dur" : {
"commits" : 27,
"journaledMB" : 0.36864,
"writeToDataFilesMB" : 1.241313,
"compression" : 0.2963027264769924,
"commitsInWriteLock" : 0,
"earlyCommits" : 0,
"timeMs" : {
"dt" : 3269,
"prepLogBuffer" : 0,
"writeToJournal" : 442,
"writeToDataFiles" : 4,
"remapPrivateView" : 23
}
},
"recordStats" : {
"accessesNotInMemory" : 32752,
"pageFaultExceptionsThrown" : 1656,
"amazon" : {
"accessesNotInMemory" : 32752,
"pageFaultExceptionsThrown" : 1656
},
"local" : {
"accessesNotInMemory" : 0,
"pageFaultExceptionsThrown" : 0
},
"test" : {
"accessesNotInMemory" : 0,
"pageFaultExceptionsThrown" : 0
}
},
"ok" : 1
}
有人能给我一些建议吗?非常感谢。
答案 0 :(得分:1)
我也遇到过类似的问题:服务器有时会挂起。我对查询做了一些修改,希望能解决这个问题。现在这些错误出现得没那么频繁了。
我做了什么:
希望这些提示有所帮助。