Best guess is that this is related to readdir chunking refcounting. 
Pulling all the chunking fixes is likely to fix this.  For example, this 
one:
commit f0d5b8d4f6dcce4597459c3ebaf36d1e96e4645b
MDCACHE - Fix chunk refcounting in readdir
Also this one:
11e0e375e40658267cbf449afacaa53a136f7097
MDCACHE - Fix race between lru functions for the chunk and the parent
Daniel
On 1/29/20 6:23 AM, Madhu P Punjabi wrote:
 Hi All,
 A customer using 2.7.6 reported a crash in mdcache_lru_clean(..) when 
 "entry->sub_handle" had an unexpected address.
 It is happening when lru_reap_chunk_impl(..) calls mdcache_put(..) in 
 the following code:
 / 815                 if (entry == parent ||
   816                     pthread_rwlock_trywrlock(&entry->content_lock) 
 == 0) {
   ...
   819                         if (entry != parent) {
   ...
   823                                 (void) 
 atomic_inc_int32_t(&entry->lru.refcnt);
   824                         }
   825
   826                         /* Dequeue the chunk so it won't show up 
 anymore */
   827                         CHUNK_LRU_DQ_SAFE(lru, lq);
   828                         chunk->chunk_lru.qid = LRU_ENTRY_NONE;
   ...
   839                         mdcache_clean_dirent_chunk(chunk);
   840                         atomic_clear_uint32_t_bits(&entry->mde_flags,
   841 
                                                     MDCACHE_DIR_POPULATED);
   842
   843                         if (entry != parent) {
   ...
   848 
                                  PTHREAD_RWLOCK_unlock(&entry->content_lock);
 * 849                                 mdcache_put(entry);*
   850                         }/
 At the time of crash:
 /*entries_hiwat = 3000000, entries_used = 3000000, chunks_hiwat = 
 100000, chunks_used = 100000*/
 Has anybody seen this kind of crash? Are there any patches that may help 
 fix the crash?  We have the customer coredump, so we can provide more 
 information from it.
 Backtrace for reference:
 /(gdb) bt
 #0  0x00003fff955294a4 in .raise () from /lib64/libpthread.so.0
 #1  0x000000001005b564 in crash_handler (signo=11, info=0x3ffd73eebee8, 
 ctx=0x3ffd73eeb170) at 
 /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/MainNFSD/nfs_init.c:244
 #2  <signal handler called>
 #3  0x00000000101cf5fc in mdcache_lru_clean (entry=0x3ffd78f1d540) at 
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_lru.c:581
 #4  0x00000000101d66ec in _mdcache_lru_unref (entry=0x3ffd78f1d540, 
 flags=0, func=0x10262100 <__func__.21092> "mdcache_put", line=196)
      at 
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_lru.c:2023
 #5  0x00000000101cda88 in mdcache_put (entry=0x3ffd78f1d540) at 
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_lru.h:196
 #6  0x00000000101d0f3c in lru_reap_chunk_impl (qid=LRU_ENTRY_L2, 
 parent=0x3ffee854d080)
      at 
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_lru.c:849
 #7  0x00000000101d1314 in mdcache_get_chunk (parent=0x3ffee854d080, 
 prev_chunk=0x0, whence=0)
      at 
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_lru.c:893
 #8  0x00000000101f6c78 in mdcache_populate_dir_chunk 
 (directory=0x3ffee854d080, whence=0, dirent=0x3ffd73eec988, 
 prev_chunk=0x0, eod_met=0x3ffd73eec990)
      at 
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_helpers.c:2572
 #9  0x00000000101f90a4 in mdcache_readdir_chunked 
 (directory=0x3ffee854d080, whence=0, dir_state=0x3ffd73eecc18, 
 cb=@0x1028aaa8: 0x10044310 <populate_dirent>, attrmask=122830,
      eod_met=0x3ffd73eecd64) at 
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_helpers.c:3009
 #10 0x00000000101dfeb4 in mdcache_readdir (dir_hdl=0x3ffee854d0b8, 
 whence=0x3ffd73eeccd0, dir_state=0x3ffd73eecc18, cb=@0x1028aaa8: 
 0x10044310 <populate_dirent>,
      attrmask=122830, eod_met=0x3ffd73eecd64) at 
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_handle.c:559
 #11 0x0000000010044ff4 in fsal_readdir (directory=0x3ffee854d0b8, 
 cookie=0, nbfound=0x3ffd73eecd60, eod_met=0x3ffd73eecd64, attrmask=122830,
      cb=@0x102907c8: 0x100dcfe8 <nfs3_readdirplus_callback>, 
 opaque=0x3ffd73eecd80) at 
 /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/fsal_helper.c:1160
 #12 0x00000000100dcd64 in nfs3_readdirplus (arg=0x3ff9e8fa79c8, 
 req=0x3ff9e8fa72c0, res=0x3ff9e8ca4b50)
      at 
 /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/Protocols/NFS/nfs3_readdirplus.c:310
 #13 0x00000000100867f0 in nfs_rpc_process_request 
 (reqdata=0x3ff9e8fa72c0) at 
 /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/MainNFSD/nfs_worker_thread.c:1325
 #14 0x0000000010087340 in nfs_rpc_valid_NFS (req=0x3ff9e8fa72c0) at 
 /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/MainNFSD/nfs_worker_thread.c:1545
 #15 0x00003fff95a0d448 in svc_vc_decode (req=0x3ff9e8fa72c0) at 
 /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/libntirpc/src/svc_vc.c:829
 #16 0x0000000010070f74 in nfs_rpc_decode_request (xprt=0x3ffcb0eeba00, 
 xdrs=0x3ff9e811f090)
      at 
 /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/MainNFSD/nfs_rpc_dispatcher_thread.c:1345
 #17 0x00003fff95a0d2a0 in svc_vc_recv (xprt=0x3ffcb0eeba00) at 
 /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/libntirpc/src/svc_vc.c:802
 #18 0x00003fff95a07884 in svc_rqst_xprt_task (wpe=0x3ffcb0eebc58) at 
 /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/libntirpc/src/svc_rqst.c:769
 #19 0x00003fff95a07f20 in svc_rqst_epoll_events (sr_rec=0x1002bca2390, 
 n_events=1) at 
 /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/libntirpc/src/svc_rqst.c:941
 #20 0x00003fff95a082e4 in svc_rqst_epoll_loop (sr_rec=0x1002bca2390) at 
 /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/libntirpc/src/svc_rqst.c:1014
 #21 0x00003fff95a08428 in svc_rqst_run_task (wpe=0x1002bca2390) at 
 /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/libntirpc/src/svc_rqst.c:1050
 #22 0x00003fff95a168fc in work_pool_thread (arg=0x3ffd80001f40) at 
 /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/libntirpc/src/work_pool.c:181
 #23 0x00003fff9551c99c in .start_thread () from /lib64/libpthread.so.0
 #24 0x00003fff953c781c in .__clone () from /lib64/libc.so.6/
 
 /(gdb) frame 3
 #3  0x00000000101cf5fc in mdcache_lru_clean (entry=0x3ffd78f1d540) at 
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_lru.c:581
 581                     subcall(
 (gdb) list
 576                             LogCrit(COMPONENT_CACHE_INODE_LRU,
 577                                     "Error closing file in cleanup: %s",
 578                                     fsal_err_txt(status));
 579                     }
 580
 581                     subcall(
582                             entry->sub_handle->obj_ops->release(entry->sub_handle)
 583                            );
 584                     entry->sub_handle = NULL;
 585
 *(gdb) p entry->sub_handle
 $5 = (struct fsal_obj_handle *) 0x1e489f3800000000  --------> this has an 
 unexpected address; accessing it causes the crash*/
 /(gdb) frame 6
 #6  0x00000000101d0f3c in lru_reap_chunk_impl (qid=LRU_ENTRY_L2, 
 parent=0x3ffee854d080)
      at 
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_lru.c:849
 849                                     mdcache_put(entry);
 (gdb) p entry
 $13 = (mdcache_entry_t *) 0x3ffd78f1d540
 (gdb) p parent
 $14 = (mdcache_entry_t *) 0x3ffee854d080/
 /*(gdb) p lru_state
 $15 = {entries_hiwat = 3000000, entries_used = 3000000, chunks_hiwat = 
 100000, chunks_used = 100000, fds_system_imposed = 160000, 
 fds_hard_limit = 144000, fds_hiwat = 96000,
    fds_lowat = 32000, futility = 0, per_lane_work = 50, biggest_window = 
 64000, prev_fd_count = 38875, prev_time = 1580013456, fd_state = 0}*/
 
 /(gdb) p *entry
 $7 = {attr_lock = {__data = {__lock = 16381, __nr_readers = 2024405152, 
 __readers_wakeup = 16381, __writer_wakeup = 2019009936, 
 __nr_readers_queued = 0,
        __nr_writers_queued = 0, __writer = 0, __shared = 0, __pad1 = 
 2860448219136, __pad2 = 228761534566, __flags = 805318656},
      __size = "\000\000?\375x\251\370\240\000\000?\375xW\245\220", 
 '\000' <repeats 18 times>, 
 "\002\232\000\000\000\000\000\000\000\065C@\000f0\000\060\000\n\000\002",
      __align = 70357883680928}, obj_handle = {handles = {next = 
 0x2886290afa5892f8, prev = 0x3d01000001000000}, fs = 0x3013836, fsal = 
 0x6e0000000074e963,
      obj_ops = 0x8400000003013833, obj_lock = {__data = {__lock = 
 318767104, __nr_readers = 3, __readers_wakeup = 20461166, 
 __writer_wakeup = 448, __nr_readers_queued = 1,
          __nr_writers_queued = 18, __writer = 1146909539, __shared = 
 1751476595, __pad1 = 7596798812954843694, __pad2 = 4848124998864338967, 
 __flags = 1146905448},
        __size = 
"\023\000\000\000\000\000\000\003\001\070\066n\000\000\001\300\000\000\000\001\000\000\000\022D\\XXXXXXXX(a)XXX.XX\000\000\000\000\000\027D\\chab-p",
        __align = 1369094286720630787}, type = 1752789805, fsid = {major 
 = 4995697438821140480, minor = 0}, fileid = 512, state_hdl = 0x5dfc43d5},
    sub_handle = 0x1e489f3800000000, attrs = {request_mask = 
 6677607623807894576, valid_mask = 1236774467, supported = 3, type = 
 20461166, filesize = 41, fsid = {
        major = 9667270145471608893, minor = 932007903232}, acl = 0x0, 
 fileid = 31387111933, mode = 384, numlinks = 2, owner = 172197, group = 
 194403, rawdev = {major = 0,
        minor = 0}, atime = {tv_sec = 1580013344, tv_nsec = 192799000}, 
 creation = {tv_sec = 0, tv_nsec = 0}, ctime = {tv_sec = 1580013344, 
 tv_nsec = 197238000}, mtime = {
        tv_sec = 1580013344, tv_nsec = 195122884}, chgtime = {tv_sec = 
 1580013344, tv_nsec = 197238000}, spaceused = 512, change = 1777251344, 
 generation = 0,
      expire_time_attr = 60, fs_locations = 0x0, sec_label = {slai_lfs = 
 {lfs_lfs = 0, lfs_pi = 0}, slai_data = {slai_data_len = 0, slai_data_val 
 = 0x0}}}, fh_hk = {node_k = {
        left = 0x0, right = 0x0, parent = 70361210201962}, key = {hk = 
 18326489524089677003, fsal = 0x3fff90ec0000 <GPFS>, kv = {addr = 0x0, 
 len = 0}}, inavl = false},
    mde_flags = 2, attr_time = 1580013344, acl_time = 1580013344, 
 fs_locations_time = 0, lru = {q = {next = 0x0, prev = 0x0}, qid = 
 LRU_ENTRY_CLEANUP, refcnt = 0, flags = 3,
      lane = 66, cf = 0}, export_list = {next = 0x3ffd78f1d790, prev = 
 0x3ffd78f1d790}, first_export_id = -1, content_lock = {__data = {__lock 
 = 0, __nr_readers = 0,
        __readers_wakeup = 0, __writer_wakeup = 0, __nr_readers_queued = 
 0, __nr_writers_queued = 0, __writer = 0, __shared = 0, __pad1 = 0, 
 __pad2 = 0, __flags = 0},
      __size = '\000' <repeats 55 times>, __align = 0}, fsobj = {hdl = 
 {state_lock = {__data = {__lock = 0, __nr_readers = 0, __readers_wakeup 
 = 0, __writer_wakeup = 0,
            __nr_readers_queued = 0, __nr_writers_queued = 0, __writer = 
 0, __shared = 0, __pad1 = 0, __pad2 = 0, __flags = 0}, __size = '\000' 
 <repeats 55 times>, __align = 0},
        no_cleanup = false, {file = {obj = 0x3ffd78f1d578, list_of_states 
 = {next = 0x3ffd78f1d828, prev = 0x3ffd78f1d828}, layoutrecall_list = 
 {next = 0x3ffd78f1d838,
              prev = 0x3ffd78f1d838}, lock_list = {next = 0x3ffd78f1d848, 
 prev = 0x3ffd78f1d848}, nlm_share_list = {next = 0x3ffd78f1d858, prev = 
 0x3ffd78f1d858},
            write_delegated = false, fdeleg_stats = {fds_curr_delegations 
 = 0, fds_deleg_type = OPEN_DELEGATE_NONE, fds_delegation_count = 0, 
 fds_recall_count = 0,
              fds_avg_hold = 0, fds_last_delegation = 0, fds_last_recall 
 = 0, fds_num_opens = 0, fds_first_open = 0}, anon_ops = 0}, dir = 
 {junction_export = 0x3ffd78f1d578,
            export_roots = {next = 0x3ffd78f1d828, prev = 
 0x3ffd78f1d828}, exp_root_refcount = 16381}}}, fsdir = {chunks = {next = 
 0x0, prev = 0x0}, detached = {next = 0x0,
          prev = 0x0}, spin = 0, detached_count = 0, dhdl = {state_lock = 
 {__data = {__lock = 0, __nr_readers = 0, __readers_wakeup = 0, 
 __writer_wakeup = 0,
              __nr_readers_queued = 0, __nr_writers_queued = 0, __writer 
 = 16381, __shared = 2029114744, __pad1 = 70357888391208, __pad2 = 
 70357888391208, __flags = 16381},
            __size = '\000' <repeats 26 times>, 
"?\375x\361\325x\000\000?\375x\361\330(\000\000?\375x\361\330(\000\000?\375x\361\330\070",
 __align = 0}, no_cleanup = false, {
            file = {obj = 0x3ffd78f1d848, list_of_states = {next = 
 0x3ffd78f1d848, prev = 0x3ffd78f1d858}, layoutrecall_list = {next = 
 0x3ffd78f1d858, prev = 0x0}, lock_list = {
                next = 0x0, prev = 0x0}, nlm_share_list = {next = 0x0, 
 prev = 0x0}, write_delegated = false, fdeleg_stats = 
 {fds_curr_delegations = 0,
                fds_deleg_type = OPEN_DELEGATE_NONE, fds_delegation_count 
 = 0, fds_recall_count = 0, fds_avg_hold = 0, fds_last_delegation = 0, 
 fds_last_recall = 0,
                fds_num_opens = 0, fds_first_open = 0}, anon_ops = 0}, 
 dir = {junction_export = 0x3ffd78f1d848, export_roots = {next = 
 0x3ffd78f1d848, prev = 0x3ffd78f1d858},
              exp_root_refcount = 16381}}}, parent = {addr = 0x0, len = 
 0}, parent_time = 1580009483, first_ck = 0, avl = {t = {root = 0x0,
            cmp_fn = @0x10299918: 0x101fbfa0 <avl_dirent_name_cmpf>, 
 height = -1, first = 0x0, last = 0x0, size = 0}, ck = {root = 0x0,
            cmp_fn = @0x10299928: 0x101fc08c <avl_dirent_ck_cmpf>, height 
 = -1, first = 0x0, last = 0x61, size = 70357874528096}, sorted = {root = 
 0x3ffd7857a9a0,
            cmp_fn = @0x10299938: 0x101fc140 <avl_dirent_sorted_cmpf>, 
 height = -1, first = 0x0, last = 0x0, size = 0}, collisions = 0}}}} /
 
 Thanks,
 Madhu Thorat.