Hello,
We are seeing the following crash with NFS Ganesha 2.7.1. The crash itself happens further down the stack, in our FSAL module’s lookup method, when it tries to use the “name” argument, which is an invalid pointer (0xa0).
This build is NFS Ganesha 2.7.1 with the following 3 patches applied:
https://github.com/nfs-ganesha/nfs-ganesha/commit/654dd706d22663c6ae6029e0c8c5814fe0d6ff6a
https://github.com/nfs-ganesha/nfs-ganesha/commit/5dc6a70ed42275a4f6772b9802e79f23dc25fa73
The most recent patch to not return dead hash entries:
https://github.com/nfs-ganesha/nfs-ganesha/commit/25320e6544f6c5a045f20c51446f57c9dc036412
The workload:
Concurrent access from multiple threads. 1 thread continuously (in a loop) runs Python os.walk (which issues readdir calls) over the entire filesystem, roughly
5M files in total. 5 more threads are writing a few thousand files each. When the writes complete, a single thread verifies the written content, then deletes it. Then the writes repeat.
This is the same workload that causes our OOM issue:
https://lists.nfs-ganesha.org/archives/list/devel@lists.nfs-ganesha.org/thread/A6BSM65DZKYRJY7QJL5ECGRPLTRCA2F2/
#5  0x00007f6acd6a1dca in foo_lookup (parent=0x6fe2e420, name=0xa0 <Address 0xa0 out of bounds>,
handle=0x7f6abf3255a8, attrs_out=0x7f6abf3254a0) at /opt/src/src/handle.c:364
#6  0x000000000053a7a6 in mdc_lookup_uncached (mdc_parent=0x291a4ba0, name=0xa0 <Address 0xa0 out of bounds>,
new_entry=0x7f6abf325728, attrs_out=0x0) at /src/src/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_helpers.c:1293
#7  0x0000000000541344 in mdcache_readdir_chunked (directory=0x291a4ba0, whence=0, dir_state=0x7f6abf325900,
cb=0x43217c <populate_dirent>, attrmask=122830, eod_met=0x7f6abf325ffb)
at /src/src/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_helpers.c:3065
#8  0x000000000052e8c3 in mdcache_readdir (dir_hdl=0x291a4bd8, whence=0x7f6abf3258e0, dir_state=0x7f6abf325900,
cb=0x43217c <populate_dirent>, attrmask=122830, eod_met=0x7f6abf325ffb)
at /src/src/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_handle.c:559
#9  0x0000000000432a76 in fsal_readdir (directory=0x291a4bd8, cookie=0, nbfound=0x7f6abf325ffc,
eod_met=0x7f6abf325ffb, attrmask=122830, cb=0x492018 <nfs3_readdirplus_callback>, opaque=0x7f6abf325fb0)
at /src/src/FSAL/fsal_helper.c:1158
#10 0x0000000000491e71 in nfs3_readdirplus (arg=0x10704818, req=0x10704110, res=0x3103f090)
at /src/src/Protocols/NFS/nfs3_readdirplus.c:310
#11 0x00000000004574d1 in nfs_rpc_process_request (reqdata=0x10704110) at /src/src/MainNFSD/nfs_worker_thread.c:1329
#12 0x0000000000457c90 in nfs_rpc_valid_NFS (req=0x10704110) at /src/src/MainNFSD/nfs_worker_thread.c:1549
#13 0x00007f6ad115ce75 in svc_vc_decode (req=0x10704110) at /src/src/libntirpc/src/svc_vc.c:825
---Type <return> to continue, or q <return> to quit---
#14 0x000000000044a688 in nfs_rpc_decode_request (xprt=0x18390200, xdrs=0x174943c0)
at /src/src/MainNFSD/nfs_rpc_dispatcher_thread.c:1341
#15 0x00007f6ad115cd86 in svc_vc_recv (xprt=0x18390200) at /src/src/libntirpc/src/svc_vc.c:798
#16 0x00007f6ad11594d3 in svc_rqst_xprt_task (wpe=0x18390418) at /src/src/libntirpc/src/svc_rqst.c:767
#17 0x00007f6ad115994d in svc_rqst_epoll_events (sr_rec=0x2779260, n_events=1)
at /src/src/libntirpc/src/svc_rqst.c:939
#18 0x00007f6ad1159be2 in svc_rqst_epoll_loop (sr_rec=0x2779260) at /src/src/libntirpc/src/svc_rqst.c:1012
#19 0x00007f6ad1159c95 in svc_rqst_run_task (wpe=0x2779260) at /src/src/libntirpc/src/svc_rqst.c:1048
#20 0x00007f6ad11625f6 in work_pool_thread (arg=0x3b5d170) at /src/src/libntirpc/src/work_pool.c:181
#21 0x00007f6ad0169de5 in start_thread () from /lib64/libpthread.so.0
#22 0x00007f6acfa71bad in clone () from /lib64/libc.so.6
(gdb) select-frame 7
(gdb) info locals
status = {major = ERR_FSAL_INVAL, minor = 0}
cb_result = DIR_CONTINUE
entry = 0x0
attrs = {request_mask = 122830, valid_mask = 1433550, supported = 1433582, type = REGULAR_FILE, filesize = 1024,
fsid = {major = 0, minor = 0}, acl = 0x0, fileid = 47680710, mode = 438, numlinks = 1, owner = 65534,
group = 65534, rawdev = {major = 0, minor = 0}, atime = {tv_sec = 1548784955, tv_nsec = 582000000}, creation = {
tv_sec = 0, tv_nsec = 0}, ctime = {tv_sec = 1548784955, tv_nsec = 582000000}, mtime = {tv_sec = 1548784955,
tv_nsec = 582000000}, chgtime = {tv_sec = 1548784955, tv_nsec = 582000000}, spaceused = 1024,
change = 1548784955582, generation = 0, expire_time_attr = 60, fs_locations = 0x0}
dirent = 0x5296230
has_write = true
set_first_ck = false
next_ck = 2419507
look_ck = 2419507
chunk = 0x156d7240
first_pass = true
eod = false
reload_chunk = false
__func__ = "mdcache_readdir_chunked"
__PRETTY_FUNCTION__ = "mdcache_readdir_chunked"
(gdb)
status = {major = ERR_FSAL_INVAL, minor = 0}
cb_result = DIR_CONTINUE
entry = 0x0
attrs = {request_mask = 122830, valid_mask = 1433550, supported = 1433582, type = REGULAR_FILE, filesize = 1024,
fsid = {major = 0, minor = 0}, acl = 0x0, fileid = 47680710, mode = 438, numlinks = 1, owner = 65534,
group = 65534, rawdev = {major = 0, minor = 0}, atime = {tv_sec = 1548784955, tv_nsec = 582000000}, creation = {
tv_sec = 0, tv_nsec = 0}, ctime = {tv_sec = 1548784955, tv_nsec = 582000000}, mtime = {tv_sec = 1548784955,
tv_nsec = 582000000}, chgtime = {tv_sec = 1548784955, tv_nsec = 582000000}, spaceused = 1024,
change = 1548784955582, generation = 0, expire_time_attr = 60, fs_locations = 0x0}
dirent = 0x5296230
has_write = true
set_first_ck = false
next_ck = 2419507
look_ck = 2419507
chunk = 0x156d7240
first_pass = true
eod = false
reload_chunk = false
__func__ = "mdcache_readdir_chunked"
__PRETTY_FUNCTION__ = "mdcache_readdir_chunked"
(gdb) print *dirent
$1 = {chunk_list = {next = 0x0, prev = 0xa1}, chunk = 0x3018770, node_name = {
left = 0x7f6acfd39848 <main_arena+232>, right = 0x0, parent = 0}, node_ck = {left = 0x5296258, right = 0x0,
parent = 0}, node_sorted = {left = 0xffffffff, right = 0x0, parent = 0}, ck = 0, eod = false, namehash = 8192,
ckey = {hk = 1056768, fsal = 0x0, kv = {addr = 0x0, len = 134650068}}, flags = 0,
name = 0xa0 <Address 0xa0 out of bounds>, name_buffer = 0x52962d8 " "}
(gdb) print dirent.name
$3 = 0xa0 <Address 0xa0 out of bounds>
(gdb) print *0x52962d8
$4 = 32
(gdb) print dirent.name_buffer
$5 = 0x52962d8 " "
(gdb) print *dirent.name_buffer
$6 = 32 ' '
(gdb) print *chunk
$7 = {chunks = {next = 0x291a4e28, prev = 0x291a4e28}, dirents = {next = 0x2ed8c060, prev = 0x48ff2870},
parent = 0x291a4ba0, chunk_lru = {q = {next = 0x7e1c00 <CHUNK_LRU+1792>, prev = 0x4d6f5c88}, qid = LRU_ENTRY_L1,
refcnt = 0, flags = 0, lane = 8, cf = 0}, reload_ck = 0, next_ck = 0, num_entries = 2500}