We Have seen this core dump twice on our lab environment, and not sure
about the root cause
(gdb) bt
#0 0x00007f2571e9f4fb in raise () from /lib64/libpthread.so.0
#1 0x00007f25737ab337 in crash_handler (signo=6, info=0x7f254e2c6970,
ctx=0x7f254e2c6840) at
/usr/src/debug/nfs-ganesha-4.0.2/MainNFSD/nfs_init.c:256
#2 <signal handler called>
#3 0x00007f25716d6387 in raise () from /lib64/libc.so.6
#4 0x00007f25716d7a78 in abort () from /lib64/libc.so.6
#5 0x00007f2573814227 in _dec_session_ref (session=0x7f254a412e00,
func=0x7f2573922c70 <__func__.22740> "nfs4_op_destroy_session", line=97)
at /usr/src/debug/nfs-ganesha-4.0.2/SAL/nfs41_session_id.c:263
#6 0x00007f257386714b in nfs4_op_destroy_session (op=0x7f2556c380a0,
data=0x7f255302c800, resp=0x7f2556c38000)
at
/usr/src/debug/nfs-ganesha-4.0.2/Protocols/NFS/nfs4_op_destroy_session.c:97
#7 0x00007f257385fa43 in process_one_op (data=0x7f255302c800,
status=0x7f254e2c787c) at
/usr/src/debug/nfs-ganesha-4.0.2/Protocols/NFS/nfs4_Compound.c:924
#8 0x00007f2573860acb in nfs4_Compound (arg=0x7f219c59d880,
req=0x7f219c59d000, res=0x7f254a591100)
at /usr/src/debug/nfs-ganesha-4.0.2/Protocols/NFS/nfs4_Compound.c:1339
#9 0x00007f25737a5f80 in nfs_rpc_process_request (reqdata=0x7f219c59d000,
retry=false)
at /usr/src/debug/nfs-ganesha-4.0.2/MainNFSD/nfs_worker_thread.c:2055
#10 0x00007f25737a65f9 in nfs_rpc_valid_NFS (req=0x7f219c59d000) at
/usr/src/debug/nfs-ganesha-4.0.2/MainNFSD/nfs_worker_thread.c:2293
#11 0x00007f2573b7ead6 in svc_vc_decode (req=0x7f219c59d000) at
/usr/src/debug/nfs-ganesha-4.0.2/libntirpc/src/svc_vc.c:1126
#12 0x00007f2573b79c59 in svc_request (xprt=0x7f2558813000,
xdrs=0x7f24e63bbe40) at
/usr/src/debug/nfs-ganesha-4.0.2/libntirpc/src/svc_rqst.c:1229
#13 0x00007f2573b7e9db in svc_vc_recv (xprt=0x7f2558813000) at
/usr/src/debug/nfs-ganesha-4.0.2/libntirpc/src/svc_vc.c:1099
#14 0x00007f2573b79bbd in svc_rqst_xprt_task_recv (wpe=0x7f25588132f0) at
/usr/src/debug/nfs-ganesha-4.0.2/libntirpc/src/svc_rqst.c:1209
#15 0x00007f2573b7a840 in svc_rqst_epoll_loop (wpe=0x7f256e91df18) at
/usr/src/debug/nfs-ganesha-4.0.2/libntirpc/src/svc_rqst.c:1608
#16 0x00007f2573b87cda in work_pool_thread (arg=0x7f255ec32da0) at
/usr/src/debug/nfs-ganesha-4.0.2/libntirpc/src/work_pool.c:190
#17 0x00007f2571e97ea5 in start_thread () from /lib64/libpthread.so.0
#18 0x00007f257179eb0d in clone () from /lib64/libc.so.6
*WE HAVE THESE FINDINGS* :
We destroy the session, if its ref count becomes 0.
Here we are destroying the mutex on line 263, and it gave a segmentation
fault.
On gdb we found that some slots were already destroyed in the session and
others were valid.
239 int32_t _dec_session_ref(nfs41_session_t *session, const char *func, int
line)
240 {
241 int i;
242 int32_t refcnt = atomic_dec_int32_t(&session->refcount);
243 #ifdef USE_LTTNG
244 tracepoint(nfs4, session_unref, func, line, session, refcnt);
245 #endif
246
247 if (refcnt == 0) {
248
249 /* Unlink the session from the client's list of
250 sessions */
251
PTHREAD_MUTEX_lock(&session->clientid_record->cid_mutex);
252 glist_del(&session->session_link);
253
PTHREAD_MUTEX_unlock(&session->clientid_record->cid_mutex);
254
255 /* Decrement our reference to the clientid record */
256 dec_client_id_ref(session->clientid_record);
257 /* Destroy this session's mutexes and condition
variable */
258
259 for (i = 0; i < session->nb_slots; i++) {
260 nfs41_session_slot_t *slot;
261
262 slot = &session->fc_slots[i];
263 PTHREAD_MUTEX_destroy(&slot->lock);
264 release_slot(slot);
265 }
265 }
266
valid and freed slots :
(gdb) p *(session->fc_slots + 53 )
$84 = {sequence = 3, lock = {_data = {_lock = 0, __count = 0, __owner = 0,
__nusers = 0, __kind = 0,
__spins = 0, __elision = 0, _list = {_prev = 0x0, __next = 0x0}},
__size = '\000' <repeats 39 times>,
__align = 0}, cached_result = 0x7f24f5c89dc0}
(gdb) p *(session->fc_slots + 54 )
$85 = {sequence = 0, lock = {_data = {_lock = 0, __count = 0, __owner = 0,
__nusers = 0, __kind = 0,
__spins = 0, __elision = 0, _list = {_prev = 0x0, __next = 0x0}},
__size = '\000' <repeats 39 times>,
__align = 0}, cached_result = 0x0}
That may come from, if one thread is at line 372 and gets session but does
not increment refcount.
And then another thread decrements its session refcount and it becomes 0,
so the session gets freed.
But for the first thread, it increments the refcount now to 373. So now the
first thread has an invalid session.
343 int nfs41_Session_Get_Pointer(char sessionid[NFS4_SESSIONID_SIZE],
344 nfs41_session_t **session_data)
345 {
346 struct gsh_buffdesc key;
347 struct gsh_buffdesc val;
348 struct hash_latch latch;
349 char str[LOG_BUFF_LEN] = "\0";
350 struct display_buffer dspbuf = {sizeof(str), str, str};
351 bool str_valid = false;
352 hash_error_t code;
353
354 if (isFullDebug(COMPONENT_SESSIONS)) {
355 display_session_id(&dspbuf, sessionid);
356 LogFullDebug(COMPONENT_SESSIONS, "Get Session %s", str);
357 str_valid = true;
358 }
359
360 key.addr = sessionid;
361 key.len = NFS4_SESSIONID_SIZE;
362
363 code = hashtable_getlatch(ht_session_id, &key, &val, false,
&latch);
364 if (code != HASHTABLE_SUCCESS) {
365 hashtable_releaselatched(ht_session_id, &latch);
366 if (str_valid)
367 LogFullDebug(COMPONENT_SESSIONS,
368 "Session %s Not Found", str);
369 return 0;
370 }
371
372 *session_data = val.addr;
373 inc_session_ref(*session_data); /* XXX more locks? */
374
375 hashtable_releaselatched(ht_session_id, &latch);
376
377 if (str_valid)
378 LogFullDebug(COMPONENT_SESSIONS, "Session %s Found",
str);
379
380 return 1;
381 }
We can avoid it by taking lock from line 363 to 373, as sessions should not
be freed in between.