I have seen this crash multiple times after applying the memory fix (I was also seeing it
before the memory fix).
One thing that is consistent is that, right before the crash, our FSAL returns an
error from readdir: error 151 (ERR_FSAL_STALE).
I have full logs with COMPONENT_INODE_CACHE and COMPONENT_NFS_READDIR enabled at full
debug level.
20 minutes of log is about 50 MB compressed.
2.5 minutes of log is about 7.8MB compressed.
Please let me know if you would like to see the logs and where I can upload them.
Is there a potential race condition between the following paths:
Path1:
mdcache_populate_dir_chunk calls sub_handle->readdir() and, when readdir returns an
error, calls lru_remove_chunk.
lru_remove_chunk -> lru_clean_chunk -> mdcache_clean_dirent_chunk, which
cleans/frees the dirents in the chunk.
Path2:
mdcache_readdir_chunked -> the for loop that gets the dirents for the chunk.
dirent = glist_next_entry(&chunk->dirents,
mdcache_dir_entry_t,
chunk_list,
&dirent->chunk_list)
The dirent from Path2 appears to be corrupt or already freed when we dump core.
Backtrace is similar to that reported earlier, except here the dirent pointer is bad.
Reproducible.
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
Core was generated by `bin/ganesha.nfsd -f etc/ganesha/ganesha.conf -p var/run/ganesha.pid
-F'.
Program terminated with signal 11, Segmentation fault.
#0 0x0000000000540e42 in mdcache_readdir_chunked (directory=0x80cb4d0, whence=0,
dir_state=0x7fa68be22900, cb=0x43217c <populate_dirent>,
attrmask=122830, eod_met=0x7fa68be22ffb) at
/src/src/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_helpers.c:2993
2993 /src/src/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_helpers.c: No such file or
directory.
Missing separate debuginfos, use: debuginfo-install sgw-nfs-ganesha-2.0.22.0-1.x86_64
(gdb) bt
#0 0x0000000000540e42 in mdcache_readdir_chunked (directory=0x80cb4d0, whence=0,
dir_state=0x7fa68be22900, cb=0x43217c <populate_dirent>,
attrmask=122830, eod_met=0x7fa68be22ffb) at
/src/src/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_helpers.c:2993
#1 0x000000000052e8c3 in mdcache_readdir (dir_hdl=0x80cb508, whence=0x7fa68be228e0,
dir_state=0x7fa68be22900,
cb=0x43217c <populate_dirent>, attrmask=122830, eod_met=0x7fa68be22ffb)
at /src/src/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_handle.c:559
#2 0x0000000000432a76 in fsal_readdir (directory=0x80cb508, cookie=0,
nbfound=0x7fa68be22ffc, eod_met=0x7fa68be22ffb, attrmask=122830,
cb=0x492018 <nfs3_readdirplus_callback>, opaque=0x7fa68be22fb0) at
/src/src/FSAL/fsal_helper.c:1158
#3 0x0000000000491e71 in nfs3_readdirplus (arg=0x37200f38, req=0x37200830,
res=0x38040410) at /src/src/Protocols/NFS/nfs3_readdirplus.c:310
#4 0x00000000004574d1 in nfs_rpc_process_request (reqdata=0x37200830) at
/src/src/MainNFSD/nfs_worker_thread.c:1329
#5 0x0000000000457c90 in nfs_rpc_valid_NFS (req=0x37200830) at
/src/src/MainNFSD/nfs_worker_thread.c:1549
#6 0x00007fa693fdbe75 in svc_vc_decode (req=0x37200830) at
/src/src/libntirpc/src/svc_vc.c:825
#7 0x000000000044a688 in nfs_rpc_decode_request (xprt=0xaa7ce0, xdrs=0x39c31170) at
/src/src/MainNFSD/nfs_rpc_dispatcher_thread.c:1341
#8 0x00007fa693fdbd86 in svc_vc_recv (xprt=0xaa7ce0) at
/src/src/libntirpc/src/svc_vc.c:798
#9 0x00007fa693fd84d3 in svc_rqst_xprt_task (wpe=0xaa7ef8) at
/src/src/libntirpc/src/svc_rqst.c:767
#10 0x00007fa693fd894d in svc_rqst_epoll_events (sr_rec=0xa88260, n_events=1) at
/src/src/libntirpc/src/svc_rqst.c:939
#11 0x00007fa693fd8be2 in svc_rqst_epoll_loop (sr_rec=0xa88260) at
/src/src/libntirpc/src/svc_rqst.c:1012
#12 0x00007fa693fd8c95 in svc_rqst_run_task (wpe=0xa88260) at
/src/src/libntirpc/src/svc_rqst.c:1048
#13 0x00007fa693fe15f6 in work_pool_thread (arg=0x679a6bd0) at
/src/src/libntirpc/src/work_pool.c:181
#14 0x00007fa692fe8de5 in start_thread () from /lib64/libpthread.so.0
#15 0x00007fa6928f0bad in clone () from /lib64/libc.so.6
(gdb) print dirent
$1 = (mdcache_dir_entry_t *) 0x594f20325e0a0d33
(gdb) print *dirent
Cannot access memory at address 0x594f20325e0a0d33
(gdb) info locals
status = {major = ERR_FSAL_NO_ERROR, minor = 0}
cb_result = DIR_CONTINUE
entry = 0x0
attrs = {request_mask = 122830, valid_mask = 1433550, supported = 1433582, type =
REGULAR_FILE, filesize = 1024, fsid = {major = 0,
minor = 0}, acl = 0x0, fileid = 12545799, mode = 438, numlinks = 1, owner = 65534,
group = 65534, rawdev = {major = 0, minor = 0},
atime = {tv_sec = 1549408822, tv_nsec = 147000000}, creation = {tv_sec = 0, tv_nsec =
0}, ctime = {tv_sec = 1549408822,
tv_nsec = 147000000}, mtime = {tv_sec = 1549408822, tv_nsec = 147000000}, chgtime =
{tv_sec = 1549408822, tv_nsec = 147000000},
spaceused = 1024, change = 1549408822147, generation = 0, expire_time_attr = 60,
fs_locations = 0x0}
dirent = 0x594f20325e0a0d33
has_write = true
set_first_ck = false
next_ck = 1830007
look_ck = 1830007
chunk = 0x6f70300
first_pass = true
eod = false
reload_chunk = false
__func__ = "mdcache_readdir_chunked"
__PRETTY_FUNCTION__ = "mdcache_readdir_chunked"
(gdb) info args
directory = 0x80cb4d0
whence = 0
dir_state = 0x7fa68be22900
cb = 0x43217c <populate_dirent>
attrmask = 122830
eod_met = 0x7fa68be22ffb
(gdb)
A different backtrace, in which the name pointer is bad:
#5 0x00007fbb08ab9c97 in foo_lookup (parent=0x38001a80, name=0xb707a0 <Address
0xb707a0 out of bounds>, handle=0x7fbb071045a8,
attrs_out=0x7fbb071044a0) at /opt/src/src/handle.c:348
#6 0x000000000053a7a6 in mdc_lookup_uncached (mdc_parent=0x20bb8e00, name=0xb707a0
<Address 0xb707a0 out of bounds>,
new_entry=0x7fbb07104728, attrs_out=0x0) at
/src/src/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_helpers.c:1293
#7 0x0000000000541344 in mdcache_readdir_chunked (directory=0x20bb8e00, whence=0,
dir_state=0x7fbb07104900, cb=0x43217c <populate_dirent>,
attrmask=122830, eod_met=0x7fbb07104ffb) at
/src/src/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_helpers.c:3065
#8 0x000000000052e8c3 in mdcache_readdir (dir_hdl=0x20bb8e38, whence=0x7fbb071048e0,
dir_state=0x7fbb07104900,
cb=0x43217c <populate_dirent>, attrmask=122830, eod_met=0x7fbb07104ffb)
at /src/src/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_handle.c:559
#9 0x0000000000432a76 in fsal_readdir (directory=0x20bb8e38, cookie=0,
nbfound=0x7fbb07104ffc, eod_met=0x7fbb07104ffb, attrmask=122830,
cb=0x492018 <nfs3_readdirplus_callback>, opaque=0x7fbb07104fb0) at
/src/src/FSAL/fsal_helper.c:1158
#10 0x0000000000491e71 in nfs3_readdirplus (arg=0x1acd5b68, req=0x1acd5460,
res=0x3f538be0) at /src/src/Protocols/NFS/nfs3_readdirplus.c:310
#11 0x00000000004574d1 in nfs_rpc_process_request (reqdata=0x1acd5460) at
/src/src/MainNFSD/nfs_worker_thread.c:1329