Best guess is that this is related to readdir chunking refcounting.
Pulling all the chunking fixes is likely to fix this. For example, this
one:
commit f0d5b8d4f6dcce4597459c3ebaf36d1e96e4645b
MDCACHE - Fix chunk refcounting in readdir
Also this one:
11e0e375e40658267cbf449afacaa53a136f7097
MDCACHE - Fix race between lru functions for the chunk and the parent
Daniel
On 1/29/20 6:23 AM, Madhu P Punjabi wrote:
Hi All,
A customer using 2.7.6 reported a crash inmdcache_lru_clean(..) when
"entry->sub_handle" had an unexpected address.
It is happening when lru_reap_chunk_impl(..) calls mdcache_put(..) in
the following code:
/ 815 if (entry == parent ||
816 pthread_rwlock_trywrlock(&entry->content_lock)
== 0) {
...
819 if (entry != parent) {
...
823 (void)
atomic_inc_int32_t(&entry->lru.refcnt);
824 }
825
826 /* Dequeue the chunk so it won't show up
anymore */
827 CHUNK_LRU_DQ_SAFE(lru, lq);
828 chunk->chunk_lru.qid = LRU_ENTRY_NONE;
...
839 mdcache_clean_dirent_chunk(chunk);
840 atomic_clear_uint32_t_bits(&entry->mde_flags,
841
MDCACHE_DIR_POPULATED);
842
843 if (entry != parent) {
...
848
PTHREAD_RWLOCK_unlock(&entry->content_lock);
* 849 mdcache_put(entry);*
850 }/
At the time of crash:
/*entries_hiwat = 3000000, entries_used = 3000000, chunks_hiwat =
100000, chunks_used = 100000*/
Has anybody seen this kind of crash ? Any patches that may help to fix
the crash ? We have the customer coredump, so can provide more
information using it.
Backtrace for reference:
/(gdb) bt
#0 0x00003fff955294a4 in .raise () from /lib64/libpthread.so.0
#1 0x000000001005b564 in crash_handler (signo=11, info=0x3ffd73eebee8,
ctx=0x3ffd73eeb170) at
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/MainNFSD/nfs_init.c:244
#2 <signal handler called>
#3 0x00000000101cf5fc in mdcache_lru_clean (entry=0x3ffd78f1d540) at
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_lru.c:581
#4 0x00000000101d66ec in _mdcache_lru_unref (entry=0x3ffd78f1d540,
flags=0, func=0x10262100 <__func__.21092> "mdcache_put", line=196)
at
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_lru.c:2023
#5 0x00000000101cda88 in mdcache_put (entry=0x3ffd78f1d540) at
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_lru.h:196
#6 0x00000000101d0f3c in lru_reap_chunk_impl (qid=LRU_ENTRY_L2,
parent=0x3ffee854d080)
at
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_lru.c:849
#7 0x00000000101d1314 in mdcache_get_chunk (parent=0x3ffee854d080,
prev_chunk=0x0, whence=0)
at
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_lru.c:893
#8 0x00000000101f6c78 in mdcache_populate_dir_chunk
(directory=0x3ffee854d080, whence=0, dirent=0x3ffd73eec988,
prev_chunk=0x0, eod_met=0x3ffd73eec990)
at
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_helpers.c:2572
#9 0x00000000101f90a4 in mdcache_readdir_chunked
(directory=0x3ffee854d080, whence=0, dir_state=0x3ffd73eecc18,
cb=@0x1028aaa8: 0x10044310 <populate_dirent>, attrmask=122830,
eod_met=0x3ffd73eecd64) at
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_helpers.c:3009
#10 0x00000000101dfeb4 in mdcache_readdir (dir_hdl=0x3ffee854d0b8,
whence=0x3ffd73eeccd0, dir_state=0x3ffd73eecc18, cb=@0x1028aaa8:
0x10044310 <populate_dirent>,
attrmask=122830, eod_met=0x3ffd73eecd64) at
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_handle.c:559
#11 0x0000000010044ff4 in fsal_readdir (directory=0x3ffee854d0b8,
cookie=0, nbfound=0x3ffd73eecd60, eod_met=0x3ffd73eecd64, attrmask=122830,
cb=@0x102907c8: 0x100dcfe8 <nfs3_readdirplus_callback>,
opaque=0x3ffd73eecd80) at
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/fsal_helper.c:1160
#12 0x00000000100dcd64 in nfs3_readdirplus (arg=0x3ff9e8fa79c8,
req=0x3ff9e8fa72c0, res=0x3ff9e8ca4b50)
at
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/Protocols/NFS/nfs3_readdirplus.c:310
#13 0x00000000100867f0 in nfs_rpc_process_request
(reqdata=0x3ff9e8fa72c0) at
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/MainNFSD/nfs_worker_thread.c:1325
#14 0x0000000010087340 in nfs_rpc_valid_NFS (req=0x3ff9e8fa72c0) at
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/MainNFSD/nfs_worker_thread.c:1545
#15 0x00003fff95a0d448 in svc_vc_decode (req=0x3ff9e8fa72c0) at
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/libntirpc/src/svc_vc.c:829
#16 0x0000000010070f74 in nfs_rpc_decode_request (xprt=0x3ffcb0eeba00,
xdrs=0x3ff9e811f090)
at
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/MainNFSD/nfs_rpc_dispatcher_thread.c:1345
#17 0x00003fff95a0d2a0 in svc_vc_recv (xprt=0x3ffcb0eeba00) at
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/libntirpc/src/svc_vc.c:802
#18 0x00003fff95a07884 in svc_rqst_xprt_task (wpe=0x3ffcb0eebc58) at
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/libntirpc/src/svc_rqst.c:769
#19 0x00003fff95a07f20 in svc_rqst_epoll_events (sr_rec=0x1002bca2390,
n_events=1) at
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/libntirpc/src/svc_rqst.c:941
#20 0x00003fff95a082e4 in svc_rqst_epoll_loop (sr_rec=0x1002bca2390) at
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/libntirpc/src/svc_rqst.c:1014
#21 0x00003fff95a08428 in svc_rqst_run_task (wpe=0x1002bca2390) at
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/libntirpc/src/svc_rqst.c:1050
#22 0x00003fff95a168fc in work_pool_thread (arg=0x3ffd80001f40) at
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/libntirpc/src/work_pool.c:181
#23 0x00003fff9551c99c in .start_thread () from /lib64/libpthread.so.0
#24 0x00003fff953c781c in .__clone () from /lib64/libc.so.6/
/(gdb) frame 3
#3 0x00000000101cf5fc in mdcache_lru_clean (entry=0x3ffd78f1d540) at
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_lru.c:581
581 subcall(
(gdb) list
576 LogCrit(COMPONENT_CACHE_INODE_LRU,
577 "Error closing file in cleanup: %s",
578 fsal_err_txt(status));
579 }
580
581 subcall(
582 entry->sub_handle->obj_ops->release(entry->sub_handle)
583 );
584 entry->sub_handle = NULL;
585
*(gdb) p entry->sub_handle
$5 = (struct fsal_obj_handle *) 0x1e489f3800000000 --------> this has
unexpected address, accessing it causing crash*/
/(gdb) frame 6
#6 0x00000000101d0f3c in lru_reap_chunk_impl (qid=LRU_ENTRY_L2,
parent=0x3ffee854d080)
at
/usr/src/debug/nfs-ganesha-2.7.5-ibm053.02/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_lru.c:849
849 mdcache_put(entry);
(gdb) p entry
$13 = (mdcache_entry_t *) 0x3ffd78f1d540
(gdb) p parent
$14 = (mdcache_entry_t *) 0x3ffee854d080/
/*(gdb) p lru_state
$15 = {entries_hiwat = 3000000, entries_used = 3000000, chunks_hiwat =
100000, chunks_used = 100000, fds_system_imposed = 160000,
fds_hard_limit = 144000, fds_hiwat = 96000,
fds_lowat = 32000, futility = 0, per_lane_work = 50, biggest_window =
64000, prev_fd_count = 38875, prev_time = 1580013456, fd_state = 0}*/
/(gdb) p *entry
$7 = {attr_lock = {__data = {__lock = 16381, __nr_readers = 2024405152,
__readers_wakeup = 16381, __writer_wakeup = 2019009936,
__nr_readers_queued = 0,
__nr_writers_queued = 0, __writer = 0, __shared = 0, __pad1 =
2860448219136, __pad2 = 228761534566, __flags = 805318656},
__size = "\000\000?\375x\251\370\240\000\000?\375xW\245\220",
'\000' <repeats 18 times>,
"\002\232\000\000\000\000\000\000\000\065C@\000f0\000\060\000\n\000\002",
__align = 70357883680928}, obj_handle = {handles = {next =
0x2886290afa5892f8, prev = 0x3d01000001000000}, fs = 0x3013836, fsal =
0x6e0000000074e963,
obj_ops = 0x8400000003013833, obj_lock = {__data = {__lock =
318767104, __nr_readers = 3, __readers_wakeup = 20461166,
__writer_wakeup = 448, __nr_readers_queued = 1,
__nr_writers_queued = 18, __writer = 1146909539, __shared =
1751476595, __pad1 = 7596798812954843694, __pad2 = 4848124998864338967,
__flags = 1146905448},
__size =
"\023\000\000\000\000\000\000\003\001\070\066n\000\000\001\300\000\000\000\001\000\000\000\022D\\XXXXXXXX(a)XXX.XX\000\000\000\000\000\027D\\chab-p",
__align = 1369094286720630787}, type = 1752789805, fsid = {major
= 4995697438821140480, minor = 0}, fileid = 512, state_hdl = 0x5dfc43d5},
sub_handle = 0x1e489f3800000000, attrs = {request_mask =
6677607623807894576, valid_mask = 1236774467, supported = 3, type =
20461166, filesize = 41, fsid = {
major = 9667270145471608893, minor = 932007903232}, acl = 0x0,
fileid = 31387111933, mode = 384, numlinks = 2, owner = 172197, group =
194403, rawdev = {major = 0,
minor = 0}, atime = {tv_sec = 1580013344, tv_nsec = 192799000},
creation = {tv_sec = 0, tv_nsec = 0}, ctime = {tv_sec = 1580013344,
tv_nsec = 197238000}, mtime = {
tv_sec = 1580013344, tv_nsec = 195122884}, chgtime = {tv_sec =
1580013344, tv_nsec = 197238000}, spaceused = 512, change = 1777251344,
generation = 0,
expire_time_attr = 60, fs_locations = 0x0, sec_label = {slai_lfs =
{lfs_lfs = 0, lfs_pi = 0}, slai_data = {slai_data_len = 0, slai_data_val
= 0x0}}}, fh_hk = {node_k = {
left = 0x0, right = 0x0, parent = 70361210201962}, key = {hk =
18326489524089677003, fsal = 0x3fff90ec0000 <GPFS>, kv = {addr = 0x0,
len = 0}}, inavl = false},
mde_flags = 2, attr_time = 1580013344, acl_time = 1580013344,
fs_locations_time = 0, lru = {q = {next = 0x0, prev = 0x0}, qid =
LRU_ENTRY_CLEANUP, refcnt = 0, flags = 3,
lane = 66, cf = 0}, export_list = {next = 0x3ffd78f1d790, prev =
0x3ffd78f1d790}, first_export_id = -1, content_lock = {__data = {__lock
= 0, __nr_readers = 0,
__readers_wakeup = 0, __writer_wakeup = 0, __nr_readers_queued =
0, __nr_writers_queued = 0, __writer = 0, __shared = 0, __pad1 = 0,
__pad2 = 0, __flags = 0},
__size = '\000' <repeats 55 times>, __align = 0}, fsobj = {hdl =
{state_lock = {__data = {__lock = 0, __nr_readers = 0, __readers_wakeup
= 0, __writer_wakeup = 0,
__nr_readers_queued = 0, __nr_writers_queued = 0, __writer =
0, __shared = 0, __pad1 = 0, __pad2 = 0, __flags = 0}, __size = '\000'
<repeats 55 times>, __align = 0},
no_cleanup = false, {file = {obj = 0x3ffd78f1d578, list_of_states
= {next = 0x3ffd78f1d828, prev = 0x3ffd78f1d828}, layoutrecall_list =
{next = 0x3ffd78f1d838,
prev = 0x3ffd78f1d838}, lock_list = {next = 0x3ffd78f1d848,
prev = 0x3ffd78f1d848}, nlm_share_list = {next = 0x3ffd78f1d858, prev =
0x3ffd78f1d858},
write_delegated = false, fdeleg_stats = {fds_curr_delegations
= 0, fds_deleg_type = OPEN_DELEGATE_NONE, fds_delegation_count = 0,
fds_recall_count = 0,
fds_avg_hold = 0, fds_last_delegation = 0, fds_last_recall
= 0, fds_num_opens = 0, fds_first_open = 0}, anon_ops = 0}, dir =
{junction_export = 0x3ffd78f1d578,
export_roots = {next = 0x3ffd78f1d828, prev =
0x3ffd78f1d828}, exp_root_refcount = 16381}}}, fsdir = {chunks = {next =
0x0, prev = 0x0}, detached = {next = 0x0,
prev = 0x0}, spin = 0, detached_count = 0, dhdl = {state_lock =
{__data = {__lock = 0, __nr_readers = 0, __readers_wakeup = 0,
__writer_wakeup = 0,
__nr_readers_queued = 0, __nr_writers_queued = 0, __writer
= 16381, __shared = 2029114744, __pad1 = 70357888391208, __pad2 =
70357888391208, __flags = 16381},
__size = '\000' <repeats 26 times>,
"?\375x\361\325x\000\000?\375x\361\330(\000\000?\375x\361\330(\000\000?\375x\361\330\070",
__align = 0}, no_cleanup = false, {
file = {obj = 0x3ffd78f1d848, list_of_states = {next =
0x3ffd78f1d848, prev = 0x3ffd78f1d858}, layoutrecall_list = {next =
0x3ffd78f1d858, prev = 0x0}, lock_list = {
next = 0x0, prev = 0x0}, nlm_share_list = {next = 0x0,
prev = 0x0}, write_delegated = false, fdeleg_stats =
{fds_curr_delegations = 0,
fds_deleg_type = OPEN_DELEGATE_NONE, fds_delegation_count
= 0, fds_recall_count = 0, fds_avg_hold = 0, fds_last_delegation = 0,
fds_last_recall = 0,
fds_num_opens = 0, fds_first_open = 0}, anon_ops = 0},
dir = {junction_export = 0x3ffd78f1d848, export_roots = {next =
0x3ffd78f1d848, prev = 0x3ffd78f1d858},
exp_root_refcount = 16381}}}, parent = {addr = 0x0, len =
0}, parent_time = 1580009483, first_ck = 0, avl = {t = {root = 0x0,
cmp_fn = @0x10299918: 0x101fbfa0 <avl_dirent_name_cmpf>,
height = -1, first = 0x0, last = 0x0, size = 0}, ck = {root = 0x0,
cmp_fn = @0x10299928: 0x101fc08c <avl_dirent_ck_cmpf>, height
= -1, first = 0x0, last = 0x61, size = 70357874528096}, sorted = {root =
0x3ffd7857a9a0,
cmp_fn = @0x10299938: 0x101fc140 <avl_dirent_sorted_cmpf>,
height = -1, first = 0x0, last = 0x0, size = 0}, collisions = 0}}}} /
Thanks,
Madhu Thorat.