Hi All,
 
A customer reported a crash in mdcache_readdir_chunked(..). It is happening when getattrs(..) is called for an entry associated with a dirent.
  1. (gdb) print entry->obj_handle.obj_ops.getattrs   ==========> This is pointing to unexpected address
  2. $4 = (fsal_status_t (*)(struct fsal_obj_handle *, struct attrlist *)) 0x7f5a100001b8
The customer is using Ganesha 2.5.3, which lacks some patches from Ganesha 2.7.6. The backtrace at the end of this mail shows that:
1. the fields of the 'chunk' structure have valid values
2. but the fields of the currently handled 'dirent' structure have unexpected values.
 
Is it possible that this happened because dirent->chunk points to a chunk other than the one currently being handled in mdcache_readdir_chunked(..)? If so, we would not hold a ref on dirent->chunk, so could the chunk pointed to by dirent->chunk have been reaped?
 
We don't have the patches below from 2.7.6. Could any of them help fix the crash? Or are there any other recent patches that might help?
a. MDCACHE - Fix chunk refcounting in readdir
b. MDCACHE - Drop chunk ref before dropping lock
c. MDCACHE - Update chunk pointers when splitting a chunk 
d. MDCACHE - Restart readdir if directory is invalidated
e. MDCACHE - Hold lock while dropping ref on chunk

 

Backtrace for reference:
  1. (gdb) bt
  2. #0  0x00007f6712d874ab in raise () from /lib64/libpthread.so.0
  3. #1  0x00000000004559d8 in crash_handler (signo=11, info=0x7f622c6567b0, ctx=0x7f622c656680) at /usr/src/debug/nfs-ganesha-2.5.3-ibm036.15-0.1.1-Source/MainNFSD/nfs_init.c:225
  4. #2  <signal handler called>
  5. #3  0x00007f5a100001b8 in ?? ()
  6. #4  0x0000000000553012 in mdcache_readdir_chunked (directory=0x7f5994002890, whence=0, dir_state=0x7f622c6577c0, cb=0x4337cb <populate_dirent>, attrmask=122830,
  7.     eod_met=0x7f622c657e9b) at /usr/src/debug/nfs-ganesha-2.5.3-ibm036.15-0.1.1-Source/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_helpers.c:3354
  8. #5  0x000000000053ffe3 in mdcache_readdir (dir_hdl=0x7f59940028c8, whence=0x7f622c6577a0, dir_state=0x7f622c6577c0, cb=0x4337cb <populate_dirent>, attrmask=122830,
  9.     eod_met=0x7f622c657e9b) at /usr/src/debug/nfs-ganesha-2.5.3-ibm036.15-0.1.1-Source/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_handle.c:639
  10. #6  0x00000000004340af in fsal_readdir (directory=0x7f59940028c8, cookie=0, nbfound=0x7f622c657e9c, eod_met=0x7f622c657e9b, attrmask=122830,
  11.     cb=0x496f5d <nfs3_readdirplus_callback>, opaque=0x7f622c657e50) at /usr/src/debug/nfs-ganesha-2.5.3-ibm036.15-0.1.1-Source/FSAL/fsal_helper.c:1504
  12. #7  0x0000000000496d50 in nfs3_readdirplus (arg=0x7f598047d190, req=0x7f598047c988, res=0x7f5b14232f70)
  13.     at /usr/src/debug/nfs-ganesha-2.5.3-ibm036.15-0.1.1-Source/Protocols/NFS/nfs3_readdirplus.c:309
  14. #8  0x000000000044d28f in nfs_rpc_execute (reqdata=0x7f598047c960) at /usr/src/debug/nfs-ganesha-2.5.3-ibm036.15-0.1.1-Source/MainNFSD/nfs_worker_thread.c:1290
  15. #9  0x000000000044dbc6 in worker_run (ctx=0x3f481e0) at /usr/src/debug/nfs-ganesha-2.5.3-ibm036.15-0.1.1-Source/MainNFSD/nfs_worker_thread.c:1593
  16. #10 0x000000000050c103 in fridgethr_start_routine (arg=0x3f481e0) at /usr/src/debug/nfs-ganesha-2.5.3-ibm036.15-0.1.1-Source/support/fridgethr.c:550
  17. #11 0x00007f6712d7fe25 in start_thread () from /lib64/libpthread.so.0
  18. #12 0x00007f671244334d in clone () from /lib64/libc.so.6
  19.  
  20. (gdb) frame 4
  21. #4  0x0000000000553012 in mdcache_readdir_chunked (directory=0x7f5994002890, whence=0, dir_state=0x7f622c6577c0, cb=0x4337cb <populate_dirent>, attrmask=122830,
  22.     eod_met=0x7f622c657e9b) at /usr/src/debug/nfs-ganesha-2.5.3-ibm036.15-0.1.1-Source/FSAL/Stackable_FSALs/FSAL_MDCACHE/mdcache_helpers.c:3354
  23. 3354                    status = entry->obj_handle.obj_ops.getattrs(&entry->obj_handle,   =======================> This is where the crash happens
  24.  
  25. (gdb) p  entry->obj_handle.obj_ops.getattrs   ===============================> This is pointing to unexpected address
  26. $4 = (fsal_status_t (*)(struct fsal_obj_handle *, struct attrlist *)) 0x7f5a100001b8
  27. (gdb) l
  28. 3349                     * this is to call getattrs().  We need a copy anyway, to ensure
  29. 3350                     * thread safety.
  30. 3351                     */
  31. 3352                    fsal_prepare_attrs(&attrs, attrmask);
  32. 3353
  33. 3354                    status = entry->obj_handle.obj_ops.getattrs(&entry->obj_handle,
  34. 3355                                                                &attrs);
  35. 3356                    if (FSAL_IS_ERROR(status)) {
  36. 3357                            LogFullDebugAlt(COMPONENT_NFS_READDIR,
  37. 3358                                            COMPONENT_CACHE_INODE,
 
(gdb) p has_write
$1 = true

 
  1. (gdb) p dirent                        
  2. $5 = (mdcache_dir_entry_t *) 0x7f5a10000078
  3. (gdb) p *dirent    =====================================================>  This dirent structure has fields with unexpected values
  4. $6 = {chunk_list = {next = 0x7f5a10486ce0, prev = 0x7f5a10481f70}, chunk = 0x7f5a10484240, node_hk = {left = 0x7f5a10423920, right = 0x7f5a10005860, parent = 140024792324528},
  5.   node_ck = {left = 0x7f5a104822a0, right = 0x7f5a104822a0, parent = 140024792219816}, node_sorted = {left = 0x7f5a100000a8, right = 0x7f5a1040ee80, parent = 140024796475008},
  6.   ck = 140024792219848, eod = 200, hk = {k = 140024792219864, p = 268435672}, ckey = {hk = 140024792219880, fsal = 0x7f5a100000e8, kv = {addr = 0x7f5a104831e0,
  7.       len = 140024796951008}}, flags = 268435720, entry = 0x0, name = 0x7f5a10000128 "\030\001"}
  8.  
  9. (gdb) p chunk
  10. $7 = (struct dir_chunk *) 0x7f5a104227d0
  11. (gdb) p *chunk  ======================================================> But the chunk looks okay, I searched the chunk->dirents list but didn't find address of the above dirent
  12. $8 = {chunks = {next = 0x7f5994002d00, prev = 0x7f5994002d00}, dirents = {next = 0x7f5a10423140, prev = 0x7f5a10426050}, parent = 0x7f5994002890, chunk_lru = {q = {
  13.       next = 0x84c040 <CHUNK_LRU+105952>, prev = 0x7f5bdc2a87b8}, qid = LRU_ENTRY_L1, refcnt = 1, flags = 0, lane = 473, cf = 0}, reload_ck = 0, next_ck = 0, num_entries = 48}
  14. (gdb)
  15.  
  16. (gdb) p entry->obj_handle   ==============================================> This entry has unexpected values
  17. $2 = {handles = {next = 0x7f5a10000128, prev = 0x7f5a10000138}, fs = 0x7f5a10000138, fsal = 0x7f5a10000148, obj_ops = {get_ref = 0x7f5a10000148, put_ref = 0x7f5a10000158,
  18.     release = 0x7f5a10000158, merge = 0x7f5a10000168, lookup = 0x7f5a10000168, readdir = 0x7f5a10481ef0, compute_readdir_cookie = 0x7f5a10481b30, dirent_cmp = 0x7f5a10000188,
  19.     create = 0x7f5a10000188, mkdir = 0x7f5a10000198, mknode = 0x7f5a10000198, symlink = 0x7f5a100001a8, readlink = 0x7f5a100001a8, test_access = 0x7f5a100001b8,
  20.     getattrs = 0x7f5a100001b8, setattrs = 0x7f5a100001c8, link = 0x7f5a100001c8, fs_locations = 0x7f5a100001d8, rename = 0x7f5a100001d8, unlink = 0x7f5a100001e8,
  21.     open = 0x7f5a100001e8, reopen = 0x7f5a100001f8, status = 0x7f5a100001f8, read = 0x7f5a10000208, read_plus = 0x7f5a10000208, write = 0x7f5a10000218,
  22.     write_plus = 0x7f5a10000218, seek = 0x7f5a10000228, io_advise = 0x7f5a10000228, commit = 0x7f5a10000238, lock_op = 0x7f5a10000238, share_op = 0x7f5a10000248,
  23.     close = 0x7f5a10000248, list_ext_attrs = 0x7f5a10000258, getextattr_id_by_name = 0x7f5a10000258, getextattr_value_by_name = 0x7f5a10000268,
  24.     getextattr_value_by_id = 0x7f5a10000268, setextattr_value = 0x7f5a10000278, setextattr_value_by_id = 0x7f5a10000278, remove_extattr_by_id = 0x7f5a10000288,
  25.     remove_extattr_by_name = 0x7f5a10000288, handle_is = 0x7f5a10000298, handle_to_wire = 0x7f5a10000298, handle_to_key = 0x7f5a100002a8, handle_cmp = 0x7f5a100002a8,
  26.     layoutget = 0x7f5a100002b8, layoutreturn = 0x7f5a100002b8, layoutcommit = 0x7f5a100002c8, getxattrs = 0x7f5a100002c8, setxattrs = 0x7f5a100002d8,
  27.     removexattrs = 0x7f5a100002d8, listxattrs = 0x7f5a100002e8, open2 = 0x7f5a100002e8, check_verifier = 0x7f5a100002f8, status2 = 0x7f5a100002f8, reopen2 = 0x7f5a10000308,
  28.     read2 = 0x7f5a10000308, write2 = 0x7f5a10000318, seek2 = 0x7f5a10000318, io_advise2 = 0x7f5a10000328, commit2 = 0x7f5a10000328, lock_op2 = 0x7f5a10000338,
  29.     setattr2 = 0x7f5a10000338, close2 = 0x7f5a10000348}, obj_lock = {__data = {__lock = 268436296, __nr_readers = 32602, __readers_wakeup = 268436312, __writer_wakeup = 32602,
  30.       __nr_readers_queued = 268436312, __nr_writers_queued = 32602, __writer = 268436328, __shared = 32602, __pad1 = 140024792220520, __pad2 = 140024792220536,
  31.       __flags = 268436344},
  32.     __size = "H\003\000\020Z\177\000\000X\003\000\020Z\177\000\000X\003\000\020Z\177\000\000h\003\000\020Z\177\000\000h\003\000\020Z\177\000\000x\003\000\020Z\177\000\000x\003\000\020Z\177\000", __align = 140024792220488}, type = 268436360, fsid = {major = 140024792220552, minor = 140024792220568}, fileid = 140024792220568, state_hdl = 0x7f5a100003a8}
  33.  
  34. Please note:
  35. In the coredump this is the only thread doing readdir operation.
Thanks,
Madhu Thorat.