I don't think the session can be freed between lines 372 and 373,
because the hashtable latch for ht_session_id is held at that point.
nfs41_Session_Del, which should release the last session ref, uses
HashTable_Del(), which also takes the latch, so it should wait for the
latch to be released on line 375, which is after the refcount is
incremented. So this sequence should be safe.
Unfortunately, this means there may be a refcount bug somewhere, where
we're releasing an extra ref.
Daniel
On 11/24/22 07:01, Sagar Singh wrote:
We have seen this core dump twice in our lab environment, and we are
not sure about the root cause.
(gdb) bt
#0 0x00007f2571e9f4fb in raise () from /lib64/libpthread.so.0
#1 0x00007f25737ab337 in crash_handler (signo=6, info=0x7f254e2c6970,
ctx=0x7f254e2c6840) at
/usr/src/debug/nfs-ganesha-4.0.2/MainNFSD/nfs_init.c:256
#2 <signal handler called>
#3 0x00007f25716d6387 in raise () from /lib64/libc.so.6
#4 0x00007f25716d7a78 in abort () from /lib64/libc.so.6
#5 0x00007f2573814227 in _dec_session_ref (session=0x7f254a412e00,
func=0x7f2573922c70 <__func__.22740> "nfs4_op_destroy_session", line=97)
at /usr/src/debug/nfs-ganesha-4.0.2/SAL/nfs41_session_id.c:263
#6 0x00007f257386714b in nfs4_op_destroy_session (op=0x7f2556c380a0,
data=0x7f255302c800, resp=0x7f2556c38000)
at
/usr/src/debug/nfs-ganesha-4.0.2/Protocols/NFS/nfs4_op_destroy_session.c:97
#7 0x00007f257385fa43 in process_one_op (data=0x7f255302c800,
status=0x7f254e2c787c) at
/usr/src/debug/nfs-ganesha-4.0.2/Protocols/NFS/nfs4_Compound.c:924
#8 0x00007f2573860acb in nfs4_Compound (arg=0x7f219c59d880,
req=0x7f219c59d000, res=0x7f254a591100)
at /usr/src/debug/nfs-ganesha-4.0.2/Protocols/NFS/nfs4_Compound.c:1339
#9 0x00007f25737a5f80 in nfs_rpc_process_request
(reqdata=0x7f219c59d000, retry=false)
at /usr/src/debug/nfs-ganesha-4.0.2/MainNFSD/nfs_worker_thread.c:2055
#10 0x00007f25737a65f9 in nfs_rpc_valid_NFS (req=0x7f219c59d000) at
/usr/src/debug/nfs-ganesha-4.0.2/MainNFSD/nfs_worker_thread.c:2293
#11 0x00007f2573b7ead6 in svc_vc_decode (req=0x7f219c59d000) at
/usr/src/debug/nfs-ganesha-4.0.2/libntirpc/src/svc_vc.c:1126
#12 0x00007f2573b79c59 in svc_request (xprt=0x7f2558813000,
xdrs=0x7f24e63bbe40) at
/usr/src/debug/nfs-ganesha-4.0.2/libntirpc/src/svc_rqst.c:1229
#13 0x00007f2573b7e9db in svc_vc_recv (xprt=0x7f2558813000) at
/usr/src/debug/nfs-ganesha-4.0.2/libntirpc/src/svc_vc.c:1099
#14 0x00007f2573b79bbd in svc_rqst_xprt_task_recv (wpe=0x7f25588132f0)
at /usr/src/debug/nfs-ganesha-4.0.2/libntirpc/src/svc_rqst.c:1209
#15 0x00007f2573b7a840 in svc_rqst_epoll_loop (wpe=0x7f256e91df18) at
/usr/src/debug/nfs-ganesha-4.0.2/libntirpc/src/svc_rqst.c:1608
#16 0x00007f2573b87cda in work_pool_thread (arg=0x7f255ec32da0) at
/usr/src/debug/nfs-ganesha-4.0.2/libntirpc/src/work_pool.c:190
#17 0x00007f2571e97ea5 in start_thread () from /lib64/libpthread.so.0
#18 0x00007f257179eb0d in clone () from /lib64/libc.so.6
*WE HAVE THESE FINDINGS* :
We destroy the session when its refcount drops to 0.
Here we are destroying the slot mutex on line 263, and that is where the
crash occurred (the backtrace above shows abort(), i.e. signal 6).
In gdb we found that some of the session's slots had already been
destroyed, while others were still valid.
239 int32_t _dec_session_ref(nfs41_session_t *session, const char *func, int line)
240 {
241     int i;
242     int32_t refcnt = atomic_dec_int32_t(&session->refcount);
243 #ifdef USE_LTTNG
244     tracepoint(nfs4, session_unref, func, line, session, refcnt);
245 #endif
246
247     if (refcnt == 0) {
248
249         /* Unlink the session from the client's list of
250            sessions */
251         PTHREAD_MUTEX_lock(&session->clientid_record->cid_mutex);
252         glist_del(&session->session_link);
253         PTHREAD_MUTEX_unlock(&session->clientid_record->cid_mutex);
254
255         /* Decrement our reference to the clientid record */
256         dec_client_id_ref(session->clientid_record);
257         /* Destroy this session's mutexes and condition variable */
258
259         for (i = 0; i < session->nb_slots; i++) {
260             nfs41_session_slot_t *slot;
261
262             slot = &session->fc_slots[i];
263             PTHREAD_MUTEX_destroy(&slot->lock);
264             release_slot(slot);
265         }
265 }
266
Valid and freed slots:
(gdb) p *(session->fc_slots + 53 )
$84 = {sequence = 3, lock = {__data = {__lock = 0, __count = 0, __owner =
0, __nusers = 0, __kind = 0,
__spins = 0, __elision = 0, __list = {__prev = 0x0, __next = 0x0}}, __size
= '\000' <repeats 39 times>,
__align = 0}, cached_result = 0x7f24f5c89dc0}
(gdb) p *(session->fc_slots + 54 )
$85 = {sequence = 0, lock = {__data = {__lock = 0, __count = 0, __owner =
0, __nusers = 0, __kind = 0,
__spins = 0, __elision = 0, __list = {__prev = 0x0, __next = 0x0}}, __size
= '\000' <repeats 39 times>,
__align = 0}, cached_result = 0x0}
This could happen if one thread is at line 372 and has obtained the
session pointer but has not yet incremented the refcount.
If another thread then drops its session reference and the refcount
becomes 0, the session gets freed.
The first thread then goes on to increment the refcount at line 373, so
it ends up holding a reference to an invalid (already freed) session.
343 int nfs41_Session_Get_Pointer(char sessionid[NFS4_SESSIONID_SIZE],
344                               nfs41_session_t **session_data)
345 {
346     struct gsh_buffdesc key;
347     struct gsh_buffdesc val;
348     struct hash_latch latch;
349     char str[LOG_BUFF_LEN] = "\0";
350     struct display_buffer dspbuf = {sizeof(str), str, str};
351     bool str_valid = false;
352     hash_error_t code;
353
354     if (isFullDebug(COMPONENT_SESSIONS)) {
355         display_session_id(&dspbuf, sessionid);
356         LogFullDebug(COMPONENT_SESSIONS, "Get Session %s", str);
357         str_valid = true;
358     }
359
360     key.addr = sessionid;
361     key.len = NFS4_SESSIONID_SIZE;
362
363     code = hashtable_getlatch(ht_session_id, &key, &val, false, &latch);
364     if (code != HASHTABLE_SUCCESS) {
365         hashtable_releaselatched(ht_session_id, &latch);
366         if (str_valid)
367             LogFullDebug(COMPONENT_SESSIONS,
368                          "Session %s Not Found", str);
369         return 0;
370     }
371
372     *session_data = val.addr;
373     inc_session_ref(*session_data); /* XXX more locks? */
374
375     hashtable_releaselatched(ht_session_id, &latch);
376
377     if (str_valid)
378         LogFullDebug(COMPONENT_SESSIONS, "Session %s Found", str);
379
380     return 1;
381 }
We could avoid this by holding a lock from line 363 through line 373, so
that the session cannot be freed in between.
_______________________________________________
Devel mailing list -- devel(a)lists.nfs-ganesha.org
To unsubscribe send an email to devel-leave(a)lists.nfs-ganesha.org