Hi All,
 
A customer using 2.7.6 reported a crash in mdcache_lru_clean(..) when "entry->sub_handle" had an unexpected address.
 
The crash happens when lru_reap_chunk_impl(..) calls mdcache_put(..) in the following code:
 815                 if (entry == parent ||
 816                     pthread_rwlock_trywrlock(&entry->content_lock) == 0) {

 ...
 819                         if (entry != parent) {
 ...
 823                                 (void) atomic_inc_int32_t(&entry->lru.refcnt);
 824                         }
 825
 826                         /* Dequeue the chunk so it won't show up anymore */
 827                         CHUNK_LRU_DQ_SAFE(lru, lq);
 828                         chunk->chunk_lru.qid = LRU_ENTRY_NONE;
 ...
 839                         mdcache_clean_dirent_chunk(chunk);
 840                         atomic_clear_uint32_t_bits(&entry->mde_flags,
 841                                                    MDCACHE_DIR_POPULATED);
 842
 843                         if (entry != parent) {
 ...
 848                                 PTHREAD_RWLOCK_unlock(&entry->content_lock);
 849                                 mdcache_put(entry);
 850                         }

 
At the time of crash:
entries_hiwat = 3000000, entries_used = 3000000, chunks_hiwat = 100000, chunks_used = 100000
 
Has anybody seen this kind of crash? Are there any patches that may help fix it? We have the customer's core dump, so we can provide more information from it.
 
Backtrace for reference:
(gdb) bt
#0  0x00003fff955294a4 in .raise () from /lib64/libpthread.so.0
#1  0x000000001005b564 in crash_handler (signo=11, info=0x3ffd73eebee8, ctx=0x3ffd73eeb170) at /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/MainNFSD/nfs_init.c:244
#2  <signal handler called>
#3  0x00000000101cf5fc in mdcache_lru_clean (entry=0x3ffd78f1d540) at /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_lru.c:581
#4  0x00000000101d66ec in _mdcache_lru_unref (entry=0x3ffd78f1d540, flags=0, func=0x10262100 <__func__.21092> "mdcache_put", line=196)
    at /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_lru.c:2023
#5  0x00000000101cda88 in mdcache_put (entry=0x3ffd78f1d540) at /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_lru.h:196
#6  0x00000000101d0f3c in lru_reap_chunk_impl (qid=LRU_ENTRY_L2, parent=0x3ffee854d080)
    at /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_lru.c:849
#7  0x00000000101d1314 in mdcache_get_chunk (parent=0x3ffee854d080, prev_chunk=0x0, whence=0)
    at /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_lru.c:893
#8  0x00000000101f6c78 in mdcache_populate_dir_chunk (directory=0x3ffee854d080, whence=0, dirent=0x3ffd73eec988, prev_chunk=0x0, eod_met=0x3ffd73eec990)
    at /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_helpers.c:2572
#9  0x00000000101f90a4 in mdcache_readdir_chunked (directory=0x3ffee854d080, whence=0, dir_state=0x3ffd73eecc18, cb=@0x1028aaa8: 0x10044310 <populate_dirent>, attrmask=122830,
    eod_met=0x3ffd73eecd64) at /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_helpers.c:3009
#10 0x00000000101dfeb4 in mdcache_readdir (dir_hdl=0x3ffee854d0b8, whence=0x3ffd73eeccd0, dir_state=0x3ffd73eecc18, cb=@0x1028aaa8: 0x10044310 <populate_dirent>,
    attrmask=122830, eod_met=0x3ffd73eecd64) at /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_handle.c:559
#11 0x0000000010044ff4 in fsal_readdir (directory=0x3ffee854d0b8, cookie=0, nbfound=0x3ffd73eecd60, eod_met=0x3ffd73eecd64, attrmask=122830,
    cb=@0x102907c8: 0x100dcfe8 <nfs3_readdirplus_callback>, opaque=0x3ffd73eecd80) at /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/fsal_helper.c:1160
#12 0x00000000100dcd64 in nfs3_readdirplus (arg=0x3ff9e8fa79c8, req=0x3ff9e8fa72c0, res=0x3ff9e8ca4b50)
    at /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/Protocols/NFS/nfs3_readdirplus.c:310
#13 0x00000000100867f0 in nfs_rpc_process_request (reqdata=0x3ff9e8fa72c0) at /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/MainNFSD/nfs_worker_thread.c:1325
#14 0x0000000010087340 in nfs_rpc_valid_NFS (req=0x3ff9e8fa72c0) at /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/MainNFSD/nfs_worker_thread.c:1545
#15 0x00003fff95a0d448 in svc_vc_decode (req=0x3ff9e8fa72c0) at /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/libntirpc/src/svc_vc.c:829
#16 0x0000000010070f74 in nfs_rpc_decode_request (xprt=0x3ffcb0eeba00, xdrs=0x3ff9e811f090)
    at /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/MainNFSD/nfs_rpc_dispatcher_thread.c:1345
#17 0x00003fff95a0d2a0 in svc_vc_recv (xprt=0x3ffcb0eeba00) at /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/libntirpc/src/svc_vc.c:802
#18 0x00003fff95a07884 in svc_rqst_xprt_task (wpe=0x3ffcb0eebc58) at /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/libntirpc/src/svc_rqst.c:769
#19 0x00003fff95a07f20 in svc_rqst_epoll_events (sr_rec=0x1002bca2390, n_events=1) at /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/libntirpc/src/svc_rqst.c:941
#20 0x00003fff95a082e4 in svc_rqst_epoll_loop (sr_rec=0x1002bca2390) at /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/libntirpc/src/svc_rqst.c:1014
#21 0x00003fff95a08428 in svc_rqst_run_task (wpe=0x1002bca2390) at /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/libntirpc/src/svc_rqst.c:1050
#22 0x00003fff95a168fc in work_pool_thread (arg=0x3ffd80001f40) at /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/libntirpc/src/work_pool.c:181
#23 0x00003fff9551c99c in .start_thread () from /lib64/libpthread.so.0
#24 0x00003fff953c781c in .__clone () from /lib64/libc.so.6

(gdb) frame 3
#3  0x00000000101cf5fc in mdcache_lru_clean (entry=0x3ffd78f1d540) at /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_lru.c:581
581                     subcall(
(gdb) list
576                             LogCrit(COMPONENT_CACHE_INODE_LRU,
577                                     "Error closing file in cleanup: %s",
578                                     fsal_err_txt(status));
579                     }
580
581                     subcall(
582                             entry->sub_handle->obj_ops->release(entry->sub_handle)
583                            );
584                     entry->sub_handle = NULL;
585
(gdb) p entry->sub_handle
$5 = (struct fsal_obj_handle *) 0x1e489f3800000000  --------> this is an unexpected address; accessing it causes the crash
 
(gdb) frame 6
#6  0x00000000101d0f3c in lru_reap_chunk_impl (qid=LRU_ENTRY_L2, parent=0x3ffee854d080)
    at /usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_lru.c:849
849                                     mdcache_put(entry);
(gdb) p entry
$13 = (mdcache_entry_t *) 0x3ffd78f1d540
(gdb) p parent
$14 = (mdcache_entry_t *) 0x3ffee854d080
(gdb) p lru_state
$15 = {entries_hiwat = 3000000, entries_used = 3000000, chunks_hiwat = 100000, chunks_used = 100000, fds_system_imposed = 160000, fds_hard_limit = 144000, fds_hiwat = 96000,
  fds_lowat = 32000, futility = 0, per_lane_work = 50, biggest_window = 64000, prev_fd_count = 38875, prev_time = 1580013456, fd_state = 0}

(gdb) p *entry
$7 = {attr_lock = {__data = {__lock = 16381, __nr_readers = 2024405152, __readers_wakeup = 16381, __writer_wakeup = 2019009936, __nr_readers_queued = 0,
      __nr_writers_queued = 0, __writer = 0, __shared = 0, __pad1 = 2860448219136, __pad2 = 228761534566, __flags = 805318656},
    __size = "\000\000?\375x\251\370\240\000\000?\375xW\245\220", '\000' <repeats 18 times>, "\002\232\000\000\000\000\000\000\000\065C@\000f0\000\060\000\n\000\002",
    __align = 70357883680928}, obj_handle = {handles = {next = 0x2886290afa5892f8, prev = 0x3d01000001000000}, fs = 0x3013836, fsal = 0x6e0000000074e963,
    obj_ops = 0x8400000003013833, obj_lock = {__data = {__lock = 318767104, __nr_readers = 3, __readers_wakeup = 20461166, __writer_wakeup = 448, __nr_readers_queued = 1,
        __nr_writers_queued = 18, __writer = 1146909539, __shared = 1751476595, __pad1 = 7596798812954843694, __pad2 = 4848124998864338967, __flags = 1146905448},
      __size = "\023\000\000\000\000\000\000\003\001\070\066n\000\000\001\300\000\000\000\001\000\000\000\022D\\XXXXXXXX@XXX.XX\000\000\000\000\000\027D\\chab-p",
      __align = 1369094286720630787}, type = 1752789805, fsid = {major = 4995697438821140480, minor = 0}, fileid = 512, state_hdl = 0x5dfc43d5},
  sub_handle = 0x1e489f3800000000, attrs = {request_mask = 6677607623807894576, valid_mask = 1236774467, supported = 3, type = 20461166, filesize = 41, fsid = {
      major = 9667270145471608893, minor = 932007903232}, acl = 0x0, fileid = 31387111933, mode = 384, numlinks = 2, owner = 172197, group = 194403, rawdev = {major = 0,
      minor = 0}, atime = {tv_sec = 1580013344, tv_nsec = 192799000}, creation = {tv_sec = 0, tv_nsec = 0}, ctime = {tv_sec = 1580013344, tv_nsec = 197238000}, mtime = {
      tv_sec = 1580013344, tv_nsec = 195122884}, chgtime = {tv_sec = 1580013344, tv_nsec = 197238000}, spaceused = 512, change = 1777251344, generation = 0,
    expire_time_attr = 60, fs_locations = 0x0, sec_label = {slai_lfs = {lfs_lfs = 0, lfs_pi = 0}, slai_data = {slai_data_len = 0, slai_data_val = 0x0}}}, fh_hk = {node_k = {
      left = 0x0, right = 0x0, parent = 70361210201962}, key = {hk = 18326489524089677003, fsal = 0x3fff90ec0000 <GPFS>, kv = {addr = 0x0, len = 0}}, inavl = false},
  mde_flags = 2, attr_time = 1580013344, acl_time = 1580013344, fs_locations_time = 0, lru = {q = {next = 0x0, prev = 0x0}, qid = LRU_ENTRY_CLEANUP, refcnt = 0, flags = 3,
    lane = 66, cf = 0}, export_list = {next = 0x3ffd78f1d790, prev = 0x3ffd78f1d790}, first_export_id = -1, content_lock = {__data = {__lock = 0, __nr_readers = 0,
      __readers_wakeup = 0, __writer_wakeup = 0, __nr_readers_queued = 0, __nr_writers_queued = 0, __writer = 0, __shared = 0, __pad1 = 0, __pad2 = 0, __flags = 0},
    __size = '\000' <repeats 55 times>, __align = 0}, fsobj = {hdl = {state_lock = {__data = {__lock = 0, __nr_readers = 0, __readers_wakeup = 0, __writer_wakeup = 0,
          __nr_readers_queued = 0, __nr_writers_queued = 0, __writer = 0, __shared = 0, __pad1 = 0, __pad2 = 0, __flags = 0}, __size = '\000' <repeats 55 times>, __align = 0},
      no_cleanup = false, {file = {obj = 0x3ffd78f1d578, list_of_states = {next = 0x3ffd78f1d828, prev = 0x3ffd78f1d828}, layoutrecall_list = {next = 0x3ffd78f1d838,
            prev = 0x3ffd78f1d838}, lock_list = {next = 0x3ffd78f1d848, prev = 0x3ffd78f1d848}, nlm_share_list = {next = 0x3ffd78f1d858, prev = 0x3ffd78f1d858},
          write_delegated = false, fdeleg_stats = {fds_curr_delegations = 0, fds_deleg_type = OPEN_DELEGATE_NONE, fds_delegation_count = 0, fds_recall_count = 0,
            fds_avg_hold = 0, fds_last_delegation = 0, fds_last_recall = 0, fds_num_opens = 0, fds_first_open = 0}, anon_ops = 0}, dir = {junction_export = 0x3ffd78f1d578,
          export_roots = {next = 0x3ffd78f1d828, prev = 0x3ffd78f1d828}, exp_root_refcount = 16381}}}, fsdir = {chunks = {next = 0x0, prev = 0x0}, detached = {next = 0x0,
        prev = 0x0}, spin = 0, detached_count = 0, dhdl = {state_lock = {__data = {__lock = 0, __nr_readers = 0, __readers_wakeup = 0, __writer_wakeup = 0,
            __nr_readers_queued = 0, __nr_writers_queued = 0, __writer = 16381, __shared = 2029114744, __pad1 = 70357888391208, __pad2 = 70357888391208, __flags = 16381},
          __size = '\000' <repeats 26 times>, "?\375x\361\325x\000\000?\375x\361\330(\000\000?\375x\361\330(\000\000?\375x\361\330\070", __align = 0}, no_cleanup = false, {
          file = {obj = 0x3ffd78f1d848, list_of_states = {next = 0x3ffd78f1d848, prev = 0x3ffd78f1d858}, layoutrecall_list = {next = 0x3ffd78f1d858, prev = 0x0}, lock_list = {
              next = 0x0, prev = 0x0}, nlm_share_list = {next = 0x0, prev = 0x0}, write_delegated = false, fdeleg_stats = {fds_curr_delegations = 0,
              fds_deleg_type = OPEN_DELEGATE_NONE, fds_delegation_count = 0, fds_recall_count = 0, fds_avg_hold = 0, fds_last_delegation = 0, fds_last_recall = 0,
              fds_num_opens = 0, fds_first_open = 0}, anon_ops = 0}, dir = {junction_export = 0x3ffd78f1d848, export_roots = {next = 0x3ffd78f1d848, prev = 0x3ffd78f1d858},
            exp_root_refcount = 16381}}}, parent = {addr = 0x0, len = 0}, parent_time = 1580009483, first_ck = 0, avl = {t = {root = 0x0,
          cmp_fn = @0x10299918: 0x101fbfa0 <avl_dirent_name_cmpf>, height = -1, first = 0x0, last = 0x0, size = 0}, ck = {root = 0x0,
          cmp_fn = @0x10299928: 0x101fc08c <avl_dirent_ck_cmpf>, height = -1, first = 0x0, last = 0x61, size = 70357874528096}, sorted = {root = 0x3ffd7857a9a0,
          cmp_fn = @0x10299938: 0x101fc140 <avl_dirent_sorted_cmpf>, height = -1, first = 0x0, last = 0x0, size = 0}, collisions = 0}}}} 

Thanks,
Madhu Thorat.