Daniel,

 

Core dump in mdcache, in mdcache_lru_unref_chunk.  The chunk's ref count is 0.

 

I started these tests 10 days ago with all the mdcache patches applied, including the one that moves the unref_chunk in readdir_chunked inside the lock.  The test continuously runs readdir over 5 million entries while writing/reading/deleting content.  I have 2 test setups, both using a Windows client, that hit this core.

 

Let me know if there is additional information you need from the cores.

 

Thanks,

Vandana

 

[Thread debugging using libthread_db enabled]

Using host libthread_db library "/lib64/libthread_db.so.1".

Core was generated by `bin/ganesha.nfsd -f etc/ganesha/ganesha.conf -p var/run/ganesha.pid -F'.

Program terminated with signal 11, Segmentation fault.

#0  0x00007fbd14d39c40 in pthread_mutex_lock () from /lib64/libpthread.so.0

Missing separate debuginfos, use: debuginfo-install sgw-nfs-ganesha-2.0.93.0-1.x86_64

(gdb) bt

#0  0x00007fbd14d39c40 in pthread_mutex_lock () from /lib64/libpthread.so.0

#1  0x000000000052af4a in _mdcache_lru_unref_chunk (chunk=0x3836d1a0, func=0x598a00 <__func__.23678> "mdcache_readdir_chunked", line=3133)

    at /src/src/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_lru.c:2058

#2  0x0000000000542d84 in mdcache_readdir_chunked (directory=0x664fa280, whence=37935356, dir_state=0x7fbd0a3dfaf0, 

    cb=0x4323ed <populate_dirent>, attrmask=0, eod_met=0x7fbd0a3dffeb) at /src/src/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_helpers.c:3133

#3  0x000000000053054c in mdcache_readdir (dir_hdl=0x664fa2b8, whence=0x7fbd0a3dfad0, dir_state=0x7fbd0a3dfaf0, cb=0x4323ed <populate_dirent>, 

    attrmask=0, eod_met=0x7fbd0a3dffeb) at /src/src/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_handle.c:559

#4  0x0000000000432d14 in fsal_readdir (directory=0x664fa2b8, cookie=37935356, nbfound=0x7fbd0a3dffec, eod_met=0x7fbd0a3dffeb, attrmask=0, 

    cb=0x491d35 <nfs3_readdir_callback>, opaque=0x7fbd0a3dffa0) at /src/src/FSAL/fsal_helper.c:1164

#5  0x0000000000491b1d in nfs3_readdir (arg=0xa8fc5d48, req=0xa8fc5640, res=0xa32fbb30) at /src/src/Protocols/NFS/nfs3_readdir.c:289

#6  0x0000000000457c16 in nfs_rpc_process_request (reqdata=0xa8fc5640) at /src/src/MainNFSD/nfs_worker_thread.c:1328

#7  0x00000000004583d5 in nfs_rpc_valid_NFS (req=0xa8fc5640) at /src/src/MainNFSD/nfs_worker_thread.c:1548

#8  0x00007fbd15d14034 in svc_vc_decode (req=0xa8fc5640) at /src/src/libntirpc/src/svc_vc.c:829

#9  0x000000000044adc5 in nfs_rpc_decode_request (xprt=0x7ae54c90, xdrs=0x36434390) at /src/src/MainNFSD/nfs_rpc_dispatcher_thread.c:1345

#10 0x00007fbd15d13f45 in svc_vc_recv (xprt=0x7ae54c90) at /src/src/libntirpc/src/svc_vc.c:802

#11 0x00007fbd15d10689 in svc_rqst_xprt_task (wpe=0x7ae54ea8) at /src/src/libntirpc/src/svc_rqst.c:769

#12 0x00007fbd15d10ae6 in svc_rqst_epoll_events (sr_rec=0x1acd4d0, n_events=1) at /src/src/libntirpc/src/svc_rqst.c:941

#13 0x00007fbd15d10d7b in svc_rqst_epoll_loop (sr_rec=0x1acd4d0) at /src/src/libntirpc/src/svc_rqst.c:1014

#14 0x00007fbd15d10e2e in svc_rqst_run_task (wpe=0x1acd4d0) at /src/src/libntirpc/src/svc_rqst.c:1050

#15 0x00007fbd15d197f6 in work_pool_thread (arg=0x75491e70) at /src/src/libntirpc/src/work_pool.c:181

#16 0x00007fbd14d37de5 in start_thread () from /lib64/libpthread.so.0

#17 0x00007fbd1463ef1d in clone () from /lib64/libc.so.6

(gdb) select-frame 1

(gdb) info locals

rc = 32701

refcnt = 171833376

lane = 268435456

qlane = 0xe007e24e0

__func__ = "_mdcache_lru_unref_chunk"

(gdb) info args

chunk = 0x3836d1a0

func = 0x598a00 <__func__.23678> "mdcache_readdir_chunked"

line = 3133

(gdb) print *chunk

$1 = {chunks = {next = 0xed43cd15, prev = 0xa386010002000000}, dirents = {next = 0x300000003000000, prev = 0x1c00000001000000}, 

  parent = 0x7000000aaaaaaaa, chunk_lru = {q = {next = 0x6e776f6e6b6e75, prev = 0xfefffffffeffffff}, qid = LRU_ENTRY_NONE, refcnt = 0, 

    flags = 0, lane = 268435456, cf = 16777283}, reload_ck = 1099511627778, next_ck = 7236828793636126720, num_entries = 841903471}

(gdb) select-frame 2

(gdb) info locals

status = {major = ERR_FSAL_NOENT, minor = 0}

cb_result = DIR_CONTINUE

entry = 0x0

attrs = {request_mask = 0, valid_mask = 1433550, supported = 1433582, type = REGULAR_FILE, filesize = 51682, fsid = {major = 0, minor = 0}, 

  acl = 0x0, fileid = 37935372, mode = 493, numlinks = 1, owner = 4294967294, group = 4294967294, rawdev = {major = 0, minor = 0}, atime = {

    tv_sec = 1561811134795000, tv_nsec = 0}, creation = {tv_sec = 0, tv_nsec = 0}, ctime = {tv_sec = 1561811134795000, tv_nsec = 0}, mtime = {

    tv_sec = 1561795757, tv_nsec = 842000000}, chgtime = {tv_sec = 1561811134795000, tv_nsec = 0}, spaceused = 51682, 

  change = 1561811134795000000, generation = 0, expire_time_attr = 60, fs_locations = 0x0, sec_label = {slai_lfs = {lfs_lfs = 0, lfs_pi = 0}, 

    slai_data = {slai_data_len = 0, slai_data_val = 0x0}}}

dirent = 0xa57f3030

has_write = true

set_first_ck = false

next_ck = 37935375

look_ck = 37935377

chunk = 0x3836d1a0

first_pass = true

eod = false

reload_chunk = false

__func__ = "mdcache_readdir_chunked"

__PRETTY_FUNCTION__ = "mdcache_readdir_chunked"

(gdb) info args

directory = 0x664fa280

whence = 37935356

dir_state = 0x7fbd0a3dfaf0

cb = 0x4323ed <populate_dirent>

attrmask = 0

eod_met = 0x7fbd0a3dffeb

(gdb)