Hello,

I am not sure whether my previous mail was sent, so I am re-sending it.

We saw a crash at the following line in mdcache_get_chunk() because prev_chunk's dirents list was empty:
chunk->reload_ck = glist_last_entry(&prev_chunk->dirents,
                                    mdcache_dir_entry_t,
                                    chunk_list)->ck;

The backtrace of the coredump is at the end of the mail.

I could reproduce a similar crash by doing the following:
1. In mdcache_readdir_chunked(), I inserted a sleep() after the content_lock is released and before it is re-acquired for writing, as follows:

   2805 again:
   2806         /* Get here on first pass, retry if we don't hold the write lock,
   2807          * and repeated passes if we need to fetch another chunk.
   2808          */
   2809
   2810         LogFullDebugAlt(COMPONENT_NFS_READDIR, COMPONENT_CACHE_INODE,
   2811                         "Readdir chunked next_ck=0x%"PRIx64" look_ck=%"PRIx64,
   2812                         next_ck, look_ck);
   2813
   2814         if (look_ck == 0 ||
   2815             !mdcache_avl_lookup_ck(directory, look_ck, &dirent)) {
   2816                 fsal_status_t status;
   2817                 /* This starting position isn't in our cache...
   2818                  * Go populate the cache and process from there.
   2819                  */
   2820                 if (!has_write) {
   2821                         /* Upgrade to write lock and retry just in case
   2822                          * another thread managed to populate this cookie
   2823                          * in the meantime.
   2824                          */
   2825                         PTHREAD_RWLOCK_unlock(&directory->content_lock);
   2826                         sleep(30);                                      // Sleep here
   2827                         PTHREAD_RWLOCK_wrlock(&directory->content_lock);
   2828                         has_write = true;
   2829                         goto again;
   2830                 }

2. From the first client, run 'ls' inside a mounted directory of an export. The 'ls' is made to wait because of the sleep() in mdcache_readdir_chunked().
3. Immediately, from a second client, remove all the entries inside the mounted directory for the same export.
4. Once the sleep() returns, ganesha crashes because the 'prev_chunk' is no longer valid in mdcache_readdir_chunked().

The backtrace and gdb output from the coredump follow for reference. The code used was ganesha 2.5 with the 'readdir' patches taken from https://github.com/dang/nfs-ganesha/tree/v2.5-readdir; its mdcache_readdir_chunked() looks similar to the one in 2.8.

#0  0x00007fae938dc4ab in raise () from /lib64/libpthread.so.0
#1  0x000000000045549e in crash_handler (signo=11,
info=0x7fae25f48eb0, ctx=0x7fae25f48d80) at
/usr/src/debug/nfs-ganesha-2.5.3-ibm031.00-0.1.1-Source/MainNFSD/nfs_init.c:225
#2  <signal handler called>
#3  mdcache_get_chunk (parent=0x7faa1001a290,
prev_chunk=0x7fade0206350, whence=2147483647) at
/usr/src/debug/nfs-ganesha-2.5.3-ibm031.00-0.1.1-Source/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_lru.c:909
#4  0x000000000054fbe9 in mdcache_populate_dir_chunk
(directory=0x7faa1001a290, whence=2147483647, dirent=0x7fae25f49680,
prev_chunk=0x7fade0206350, eod_met=0x7fae25f4967f) at
/usr/src/debug/nfs-ganesha-2.5.3-ibm031.00-0.1.1-Source/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_helpers.c:2659
#5  0x0000000000551767 in mdcache_readdir_chunked
(directory=0x7faa1001a290, whence=2147483647,
dir_state=0x7fae25f49990, cb=0x43310f <populate_dirent>, attrmask=0,
eod_met=0x7fae25f49e8b) at
/usr/src/debug/nfs-ganesha-2.5.3-ibm031.00-0.1.1-Source/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_helpers.c:3053
#6  0x000000000053f39f in mdcache_readdir (dir_hdl=0x7faa1001a2c8,
whence=0x7fae25f49970, dir_state=0x7fae25f49990, cb=0x43310f
<populate_dirent>, attrmask=0, eod_met=0x7fae25f49e8b) at
/usr/src/debug/nfs-ganesha-2.5.3-ibm031.00-0.1.1-Source/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_handle.c:639
#7  0x00000000004339f3 in fsal_readdir (directory=0x7faa1001a2c8,
cookie=2147483647, nbfound=0x7fae25f49e8c, eod_met=0x7fae25f49e8b,
attrmask=0, cb=0x495d70 <nfs3_readdir_callback>,
opaque=0x7fae25f49e40) at
/usr/src/debug/nfs-ganesha-2.5.3-ibm031.00-0.1.1-Source/FSAL/fsal_helper.c:1502
#8  0x0000000000495b57 in nfs3_readdir (arg=0x7fa8b4f75e80,
req=0x7fa8b4f75678, res=0x7faad82c8c70) at
/usr/src/debug/nfs-ganesha-2.5.3-ibm031.00-0.1.1-Source/Protocols/NFS/nfs3_readdir.c:289
#9  0x000000000044ccde in nfs_rpc_execute (reqdata=0x7fa8b4f75650) at
/usr/src/debug/nfs-ganesha-2.5.3-ibm031.00-0.1.1-Source/MainNFSD/nfs_worker_thread.c:1290
#10 0x000000000044d4e8 in worker_run (ctx=0x4926600) at
/usr/src/debug/nfs-ganesha-2.5.3-ibm031.00-0.1.1-Source/MainNFSD/nfs_worker_thread.c:1562
#11 0x000000000050c57f in fridgethr_start_routine (arg=0x4926600) at
/usr/src/debug/nfs-ganesha-2.5.3-ibm031.00-0.1.1-Source/support/fridgethr.c:550

(gdb) frame 3
(gdb) p *prev_chunk
$9 = {chunks = {next = 0x7faad808a570, prev = 0x7fade00000d8}, dirents
= {next = 0x7fade0206360, prev = 0x7fade0206360}, parent = 0x0,
chunk_lru = {q = {next = 0x0, prev = 0x0}, qid = LRU_ENTRY_L1, refcnt
= 0, flags = 0, lane = 534, cf = 0}, reload_ck = 1453366958, next_ck =
0, num_entries = 112}


To fix this I have posted a patch:
https://review.gerrithub.io/c/ffilz/nfs-ganesha/+/440079

Thanks,
Madhu Thorat.