We Have seen this core dump  twice on our lab environment, and not sure about the root cause


(gdb) bt

#0 0x00007f2571e9f4fb in raise () from /lib64/libpthread.so.0

#1 0x00007f25737ab337 in crash_handler (signo=6, info=0x7f254e2c6970, ctx=0x7f254e2c6840) at /usr/src/debug/nfs-ganesha-4.0.2/MainNFSD/nfs_init.c:256

#2 <signal handler called>

#3 0x00007f25716d6387 in raise () from /lib64/libc.so.6

#4 0x00007f25716d7a78 in abort () from /lib64/libc.so.6

#5 0x00007f2573814227 in _dec_session_ref (session=0x7f254a412e00, func=0x7f2573922c70 <__func__.22740> "nfs4_op_destroy_session", line=97)

  at /usr/src/debug/nfs-ganesha-4.0.2/SAL/nfs41_session_id.c:263

#6 0x00007f257386714b in nfs4_op_destroy_session (op=0x7f2556c380a0, data=0x7f255302c800, resp=0x7f2556c38000)

  at /usr/src/debug/nfs-ganesha-4.0.2/Protocols/NFS/nfs4_op_destroy_session.c:97

#7 0x00007f257385fa43 in process_one_op (data=0x7f255302c800, status=0x7f254e2c787c) at /usr/src/debug/nfs-ganesha-4.0.2/Protocols/NFS/nfs4_Compound.c:924

#8 0x00007f2573860acb in nfs4_Compound (arg=0x7f219c59d880, req=0x7f219c59d000, res=0x7f254a591100)

  at /usr/src/debug/nfs-ganesha-4.0.2/Protocols/NFS/nfs4_Compound.c:1339

#9 0x00007f25737a5f80 in nfs_rpc_process_request (reqdata=0x7f219c59d000, retry=false)

  at /usr/src/debug/nfs-ganesha-4.0.2/MainNFSD/nfs_worker_thread.c:2055

#10 0x00007f25737a65f9 in nfs_rpc_valid_NFS (req=0x7f219c59d000) at /usr/src/debug/nfs-ganesha-4.0.2/MainNFSD/nfs_worker_thread.c:2293

#11 0x00007f2573b7ead6 in svc_vc_decode (req=0x7f219c59d000) at /usr/src/debug/nfs-ganesha-4.0.2/libntirpc/src/svc_vc.c:1126

#12 0x00007f2573b79c59 in svc_request (xprt=0x7f2558813000, xdrs=0x7f24e63bbe40) at /usr/src/debug/nfs-ganesha-4.0.2/libntirpc/src/svc_rqst.c:1229

#13 0x00007f2573b7e9db in svc_vc_recv (xprt=0x7f2558813000) at /usr/src/debug/nfs-ganesha-4.0.2/libntirpc/src/svc_vc.c:1099

#14 0x00007f2573b79bbd in svc_rqst_xprt_task_recv (wpe=0x7f25588132f0) at /usr/src/debug/nfs-ganesha-4.0.2/libntirpc/src/svc_rqst.c:1209

#15 0x00007f2573b7a840 in svc_rqst_epoll_loop (wpe=0x7f256e91df18) at /usr/src/debug/nfs-ganesha-4.0.2/libntirpc/src/svc_rqst.c:1608

#16 0x00007f2573b87cda in work_pool_thread (arg=0x7f255ec32da0) at /usr/src/debug/nfs-ganesha-4.0.2/libntirpc/src/work_pool.c:190

#17 0x00007f2571e97ea5 in start_thread () from /lib64/libpthread.so.0

#18 0x00007f257179eb0d in clone () from /lib64/libc.so.6 


WE HAVE THESE FINDINGS : 

We destroy the session, if its ref count becomes 0.
Here we are destroying the mutex on line 263, and it gave a segmentation fault.
On gdb we found that some slots were already destroyed in the session and
others were valid.

239 int32_t _dec_session_ref(nfs41_session_t *session, const char *func, int line)

240 {

241         int i;

242         int32_t refcnt = atomic_dec_int32_t(&session->refcount);

243 #ifdef USE_LTTNG

244         tracepoint(nfs4, session_unref, func, line, session, refcnt);

245 #endif

246

247         if (refcnt == 0) {

248

249                 /* Unlink the session from the client's list of

250                    sessions */

251                 PTHREAD_MUTEX_lock(&session->clientid_record->cid_mutex);

252                 glist_del(&session->session_link);

253                 PTHREAD_MUTEX_unlock(&session->clientid_record->cid_mutex);

254

255                 /* Decrement our reference to the clientid record */

256                 dec_client_id_ref(session->clientid_record);

257                 /* Destroy this session's mutexes and condition variable */

258

259                 for (i = 0; i < session->nb_slots; i++) {

260                         nfs41_session_slot_t *slot;

261

262                         slot = &session->fc_slots[i];

263                         PTHREAD_MUTEX_destroy(&slot->lock);

264                         release_slot(slot);

265                 }

265 } 

266 

valid and freed slots :

(gdb) p *(session->fc_slots + 53 )

$84 = {sequence = 3, lock = {_data = {_lock = 0, __count = 0, __owner = 0, __nusers = 0, __kind = 0,

      __spins = 0, __elision = 0, _list = {_prev = 0x0, __next = 0x0}}, __size = '\000' <repeats 39 times>,

    __align = 0}, cached_result = 0x7f24f5c89dc0}

(gdb) p *(session->fc_slots + 54 )

$85 = {sequence = 0, lock = {_data = {_lock = 0, __count = 0, __owner = 0, __nusers = 0, __kind = 0,

      __spins = 0, __elision = 0, _list = {_prev = 0x0, __next = 0x0}}, __size = '\000' <repeats 39 times>,

    __align = 0}, cached_result = 0x0}

That may come from, if one thread is at line 372 and gets session but does not increment refcount.
And then another thread decrements its session refcount and it becomes 0, so the session gets freed.
But for the first thread, it increments the refcount now to 373. So now the first thread has an invalid session.

343 int nfs41_Session_Get_Pointer(char sessionid[NFS4_SESSIONID_SIZE],

344                               nfs41_session_t **session_data)

345 {

346         struct gsh_buffdesc key;

347         struct gsh_buffdesc val;

348         struct hash_latch latch;

349         char str[LOG_BUFF_LEN] = "\0";

350         struct display_buffer dspbuf = {sizeof(str), str, str};

351         bool str_valid = false;

352         hash_error_t code;

353

354         if (isFullDebug(COMPONENT_SESSIONS)) {

355                 display_session_id(&dspbuf, sessionid);

356                 LogFullDebug(COMPONENT_SESSIONS, "Get Session %s", str);

357                 str_valid = true;

358         }

359

360         key.addr = sessionid;

361         key.len = NFS4_SESSIONID_SIZE;

362

363         code = hashtable_getlatch(ht_session_id, &key, &val, false, &latch);

364         if (code != HASHTABLE_SUCCESS) {

365                 hashtable_releaselatched(ht_session_id, &latch);

366                 if (str_valid)

367                         LogFullDebug(COMPONENT_SESSIONS,

368                                      "Session %s Not Found", str);

369                 return 0;

370         }

371

372         *session_data = val.addr;

373         inc_session_ref(*session_data); /* XXX more locks? */

374

375         hashtable_releaselatched(ht_session_id, &latch);

376

377         if (str_valid)

378                 LogFullDebug(COMPONENT_SESSIONS, "Session %s Found", str);

379

380         return 1;

381 }

We can avoid it by taking lock from line 363 to 373, as sessions should not be freed in between.