We see the below crash in the mdcache_lru_unref path with a long running test using a Windows NFS client.
We’re running ganesha v2.7.1 with 2 additional commits from -next that seemed possibly relevant:
https://github.com/nfs-ganesha/nfs-ganesha/commit/654dd706d22663c6ae6029e0c8c5814fe0d6ff6a
https://github.com/nfs-ganesha/nfs-ganesha/commit/5dc6a70ed42275a4f6772b9802e79f23dc25fa73
Client:
Windows 2012 NFSv3 [UID=-2, GID=-2, rsize=1048576, wsize=1048576, mount=hard, timeout=1.6, retry=1, locking=no, fileaccess=755, lang=ANSI, casesensitive=no, sec=sys]
Note use hard mount and nolock which are not default for Windows.
Workload:
Concurrent access from multiple threads. 1 thread continuously (in a loop) running python os.walk (i.e., readdir) of the entire filesystem, roughly ~5M files total. 5 more threads are writing a few thousand
files each. When the writes complete, a single thread verifies written content, then deletes it. Then the writes repeat again.
Test requires 24-48 hours until failure but failure is repeatable if you are sufficiently patient.
We do not see this issue with a Linux NFS client (v3 or v4) with a similar workload.
We seem to have re-entered _mdcache_lru_unref with the same entry.
Apologies if my email client destroys the formatting below.
(gdb) bt
#0 0x00007f1520efe277 in raise () from /lib64/libc.so.6
#1 0x00007f1520eff968 in abort () from /lib64/libc.so.6
#2 0x0000000000536388 in cih_remove_checked (entry=0x22a9d1a0) at /src/src/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_hash.h:394
#3 0x0000000000537248 in mdc_clean_entry (entry=0x22a9d1a0) at /src/src/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_helpers.c:264
#4 0x000000000052480d in mdcache_lru_clean (entry=0x22a9d1a0) at /src/src/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_lru.c:591
#5 0x000000000052963c in _mdcache_lru_unref (entry=0x22a9d1a0, flags=0, func=0x596b90 <__func__.23067> "cih_remove_checked", line=406)
at /src/src/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_lru.c:1989
#6 0x000000000053642d in cih_remove_checked (entry=0x22a9d1a0) at /src/src/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_hash.h:406
#7 0x0000000000537248 in mdc_clean_entry (entry=0x22a9d1a0) at /src/src/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_helpers.c:264
#8 0x000000000052480d in mdcache_lru_clean (entry=0x22a9d1a0) at /src/src/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_lru.c:591
#9 0x000000000052963c in _mdcache_lru_unref (entry=0x22a9d1a0, flags=0, func=0x5941df <__func__.22972> "mdcache_put", line=196)
at /src/src/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_lru.c:1989
#10 0x000000000052cb54 in mdcache_put (entry=0x22a9d1a0) at /src/src/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_lru.h:196
#11 0x0000000000530bca in mdcache_put_ref (obj_hdl=0x22a9d1d8) at /src/src/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_handle.c:1280
#12 0x00000000004327e5 in populate_dirent (name=0x43f30278 "random.267", obj=0x22a9d1d8, attrs=0x22a9d268, dir_state=0x7f15176e9af0, cookie=3786442)
at /src/src/FSAL/fsal_helper.c:1064
#13 0x0000000000541a25 in mdcache_readdir_chunked (directory=0xcf7c7c0, whence=3786313, dir_state=0x7f15176e9af0, cb=0x43217c <populate_dirent>,
attrmask=0, eod_met=0x7f15176e9feb) at /src/src/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_helpers.c:3146
#14 0x000000000052e8c3 in mdcache_readdir (dir_hdl=0xcf7c7f8, whence=0x7f15176e9ad0, dir_state=0x7f15176e9af0, cb=0x43217c <populate_dirent>, attrmask=0,
eod_met=0x7f15176e9feb) at /src/src/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_handle.c:559
#15 0x0000000000432a76 in fsal_readdir (directory=0xcf7c7f8, cookie=3786313, nbfound=0x7f15176e9fec, eod_met=0x7f15176e9feb, attrmask=0,
cb=0x4912a2 <nfs3_readdir_callback>, opaque=0x7f15176e9fa0) at /src/src/FSAL/fsal_helper.c:1158
#16 0x000000000049108a in nfs3_readdir (arg=0x996f788, req=0x996f080, res=0x3f3c6a30) at /src/src/Protocols/NFS/nfs3_readdir.c:289
#17 0x00000000004574d1 in nfs_rpc_process_request (reqdata=0x996f080) at /src/src/MainNFSD/nfs_worker_thread.c:1329
#18 0x0000000000457c90 in nfs_rpc_valid_NFS (req=0x996f080) at /src/src/MainNFSD/nfs_worker_thread.c:1549
#19 0x00007f15226b1e75 in svc_vc_decode (req=0x996f080) at /src/src/libntirpc/src/svc_vc.c:825
#20 0x000000000044a688 in nfs_rpc_decode_request (xprt=0x4f4e77f0, xdrs=0xd57ef60) at /src/src/MainNFSD/nfs_rpc_dispatcher_thread.c:1341
#21 0x00007f15226b1d86 in svc_vc_recv (xprt=0x4f4e77f0) at /src/src/libntirpc/src/svc_vc.c:798
#22 0x00007f15226ae4d3 in svc_rqst_xprt_task (wpe=0x4f4e7a08) at /src/src/libntirpc/src/svc_rqst.c:767
#23 0x00007f15226ae94d in svc_rqst_epoll_events (sr_rec=0x101b440, n_events=1) at /src/src/libntirpc/src/svc_rqst.c:939
#24 0x00007f15226aebe2 in svc_rqst_epoll_loop (sr_rec=0x101b440) at /src/src/libntirpc/src/svc_rqst.c:1012
#25 0x00007f15226aec95 in svc_rqst_run_task (wpe=0x101b440) at /src/src/libntirpc/src/svc_rqst.c:1048
(gdb) print *entry
$2 = {attr_lock = {__data = {__lock = 0, __nr_readers = 0, __readers_wakeup = 0, __writer_wakeup = 0, __nr_readers_queued = 0, __nr_writers_queued = 0,
__writer = 0, __shared = 0, __pad1 = 0, __pad2 = 0, __flags = 0}, __size = '\000' <repeats 55 times>, __align = 0}, obj_handle = {handles = {
next = 0x12a5ac78, prev = 0x12de5418}, fs = 0x0, fsal = 0x7d3340 <MDCACHE>, obj_ops = 0x7d3490 <MDCACHE+336>, obj_lock = {__data = {__lock = 0,
__nr_readers = 0, __readers_wakeup = 0, __writer_wakeup = 0, __nr_readers_queued = 0, __nr_writers_queued = 0, __writer = 0, __shared = 0,
__pad1 = 0, __pad2 = 0, __flags = 0}, __size = '\000' <repeats 55 times>, __align = 0}, type = REGULAR_FILE, fsid = {major = 0, minor = 0},
fileid = 2454576, state_hdl = 0x22a9d428}, sub_handle = 0x0, attrs = {request_mask = 1433550, valid_mask = 1433550, supported = 1433582,
type = REGULAR_FILE, filesize = 1024, fsid = {major = 0, minor = 0}, acl = 0x0, fileid = 2454576, mode = 438, numlinks = 1, owner = 65534,
group = 65534, rawdev = {major = 0, minor = 0}, atime = {tv_sec = 1547875446, tv_nsec = 112000000}, creation = {tv_sec = 0, tv_nsec = 0}, ctime = {
tv_sec = 1547875446, tv_nsec = 112000000}, mtime = {tv_sec = 1547875446, tv_nsec = 112000000}, chgtime = {tv_sec = 1547875446, tv_nsec = 112000000},
spaceused = 1024, change = 1547875446112, generation = 0, expire_time_attr = 60, fs_locations = 0x0}, fh_hk = {node_k = {left = 0x0, right = 0x0,
parent = 267421682}, key = {hk = 10676371249170983334, fsal = 0x7f15186d45e0 <FOO>, kv = {addr = 0x4761f300, len = 10}}, inavl = false},
mde_flags = 1, attr_time = 1547998073, acl_time = 0, fs_locations_time = 0, lru = {q = {next = 0x0, prev = 0x0}, qid = LRU_ENTRY_L1, refcnt = 0,
flags = 0, lane = 10, cf = 0}, export_list = {next = 0x22a9d3d8, prev = 0x22a9d3d8}, first_export_id = -1, content_lock = {__data = {__lock = 0,
__nr_readers = 0, __readers_wakeup = 0, __writer_wakeup = 0, __nr_readers_queued = 0, __nr_writers_queued = 0, __writer = 0, __shared = 0,
__pad1 = 0, __pad2 = 0, __flags = 0}, __size = '\000' <repeats 55 times>, __align = 0}, fsobj = {hdl = {state_lock = {__data = {__lock = 0,
__nr_readers = 0, __readers_wakeup = 0, __writer_wakeup = 0, __nr_readers_queued = 0, __nr_writers_queued = 0, __writer = 0, __shared = 0,
__pad1 = 0, __pad2 = 0, __flags = 0}, __size = '\000' <repeats 55 times>, __align = 0}, no_cleanup = false, {file = {obj = 0x22a9d1d8,
list_of_states = {next = 0x22a9d470, prev = 0x22a9d470}, layoutrecall_list = {next = 0x22a9d480, prev = 0x22a9d480}, lock_list = {
next = 0x22a9d490, prev = 0x22a9d490}, nlm_share_list = {next = 0x22a9d4a0, prev = 0x22a9d4a0}, write_delegated = false, fdeleg_stats = {
fds_curr_delegations = 0, fds_deleg_type = OPEN_DELEGATE_NONE, fds_delegation_count = 0, fds_recall_count = 0, fds_avg_hold = 0,
fds_last_delegation = 0, fds_last_recall = 0, fds_num_opens = 0, fds_first_open = 0}, anon_ops = 0}, dir = {junction_export = 0x22a9d1d8,
export_roots = {next = 0x22a9d470, prev = 0x22a9d470}, exp_root_refcount = 581555328}}}, fsdir = {chunks = {next = 0x0, prev = 0x0}, detached = {
next = 0x0, prev = 0x0}, spin = 0, detached_count = 0, dhdl = {state_lock = {__data = {__lock = 0, __nr_readers = 0, __readers_wakeup = 0,
__writer_wakeup = 0, __nr_readers_queued = 0, __nr_writers_queued = 0, __writer = 581554648, __shared = 0, __pad1 = 581555312,
__pad2 = 581555312, __flags = 581555328},
__size = '\000' <repeats 24 times>, "\330ѩ\"\000\000\000\000p\324\251\"\000\000\000\000p\324\251\"\000\000\000\000\200\324\251\"\000\000\000",
__align = 0}, no_cleanup = 128, {file = {obj = 0x22a9d490, list_of_states = {next = 0x22a9d490, prev = 0x22a9d4a0}, layoutrecall_list = {
next = 0x22a9d4a0, prev = 0x0}, lock_list = {next = 0x0, prev = 0x0}, nlm_share_list = {next = 0x0, prev = 0x0}, write_delegated = false,
fdeleg_stats = {fds_curr_delegations = 0, fds_deleg_type = OPEN_DELEGATE_NONE, fds_delegation_count = 0, fds_recall_count = 0,
fds_avg_hold = 0, fds_last_delegation = 0, fds_last_recall = 0, fds_num_opens = 0, fds_first_open = 0}, anon_ops = 0}, dir = {
junction_export = 0x22a9d490, export_roots = {next = 0x22a9d490, prev = 0x22a9d4a0}, exp_root_refcount = 581555360}}}, parent = {addr = 0x0,
len = 0}, first_ck = 0, avl = {t = {root = 0x0, cmp_fn = 0x0, height = 0, first = 0x0, last = 0x0, size = 0}, ck = {root = 0x0, cmp_fn = 0x0,
height = 0, first = 0x0, last = 0x0, size = 0}, sorted = {root = 0x0, cmp_fn = 0x0, height = 0, first = 0x0, last = 0x0, size = 0},
collisions = 0}}}}