Hello,
I'm hitting a scenario where the entry at the LRU end of the L2 queue becomes active, but
we don't move it to L1 - likely because the entry becomes active in the context of a
readdir. The cache keeps growing to the point where the kernel invokes the OOM killer to
terminate the ganesha process.
When we reap entries (lru_reap_impl), could we look beyond the LRU end - perhaps try a
fixed number of entries? Another option is to also garbage-collect the L2 queue and free
claimable entries beyond the LRU end of the queue (through mdcache_lru_release_entries()).
Any other thoughts?
In the instance below, MDCache is supposed to be capped at 100K entries, but it grows to
more than 5 million entries (~17 * 310K).
sudo gdb -q -p $(pidof ganesha.nfsd) -batch -ex 'p LRU[0].L1' -ex 'p
LRU[0].L2' -ex 'p LRU[1].L1' -ex 'p LRU[1].L2' -ex 'p
LRU[2].L1' -ex 'p LRU[2].L2' -ex 'p LRU[3].L1' -ex 'p
LRU[3].L2' -ex 'p LRU[4].L1' -ex 'p LRU[4].L2' -ex 'p
LRU[5].L1' -ex 'p LRU[5].L2' -ex 'p LRU[6].L1' -ex 'p
LRU[6].L2'
$1 = {q = {next = 0x7fe16a6adc30, prev = 0x7fe066775d30}, id = LRU_ENTRY_L1, size = 37}
$2 = {q = {next = 0x7fe0cd6d1130, prev = 0x7fdd595e2030}, id = LRU_ENTRY_L2, size =
310609}
$3 = {q = {next = 0x7fe222cc7930, prev = 0x7fe0e8afaf30}, id = LRU_ENTRY_L1, size = 37}
$4 = {q = {next = 0x7fdfa2022d30, prev = 0x7fe01c386b30}, id = LRU_ENTRY_L2, size =
310459}
$5 = {q = {next = 0x7fdfdd8acb30, prev = 0x7fe233849b30}, id = LRU_ENTRY_L1, size = 31}
$6 = {q = {next = 0x7fdf014e7e30, prev = 0x7fdd90fd7430}, id = LRU_ENTRY_L2, size =
310297}
$7 = {q = {next = 0x7fde79a4f030, prev = 0x7fe233a4aa30}, id = LRU_ENTRY_L1, size = 32}
$8 = {q = {next = 0x7fe061388430, prev = 0x7fdd24b5cf30}, id = LRU_ENTRY_L2, size =
310659}
$9 = {q = {next = 0x7fe1e96ce430, prev = 0x7fe0b3b4b130}, id = LRU_ENTRY_L1, size = 34}
$10 = {q = {next = 0x7fe00d84ff30, prev = 0x7fdd685b1530}, id = LRU_ENTRY_L2, size =
310635}
$11 = {q = {next = 0x7fdf9df4fb30, prev = 0x7fe2414aaa30}, id = LRU_ENTRY_L1, size = 33}
$12 = {q = {next = 0x7fe165e82d30, prev = 0x7fdf1d2b8a30}, id = LRU_ENTRY_L2, size =
310566}
$13 = {q = {next = 0x7fe159e55a30, prev = 0x7fde3f973d30}, id = LRU_ENTRY_L1, size = 41}
$14 = {q = {next = 0x7fdf4fbb9030, prev = 0x7fdea8ca0730}, id = LRU_ENTRY_L2, size =
310460}
The first entry has a refcnt of 2, but the next entries (refcnt = 1) are actually claimable.
sudo gdb -q -p $(pidof ganesha.nfsd) -batch -ex 'p *(mdcache_lru_t
*)LRU[0].L2.q.next'
$1 = {q = {next = 0x7fe0fbff0c30, prev = 0x7fe250de2960 <LRU+32>}, qid =
LRU_ENTRY_L2, refcnt = 2, flags = 0, lane = 0, cf = 0}
sudo gdb -q -p $(pidof ganesha.nfsd) -batch -ex 'p *(mdcache_lru_t
*)0x7fe0fbff0c30'
$1 = {q = {next = 0x7fe0c2c5a130, prev = 0x7fe0cd6d1130}, qid = LRU_ENTRY_L2, refcnt = 1,
flags = 0, lane = 0, cf = 0}
sudo gdb -q -p $(pidof ganesha.nfsd) -batch -ex 'p *(mdcache_lru_t
*)0x7fe0c2c5a130'
$1 = {q = {next = 0x7fe06dfeac30, prev = 0x7fe142936430}, qid = LRU_ENTRY_L2, refcnt = 1,
flags = 0, lane = 0, cf = 0}