disk LRU hang due to object read being blocked by busyobj allocation
This situation was observed during stress testing:
- An object read issued from the disk LRU waits for memory:
§3 0x00007ff75eab7473 in buddy_reqs_wait_allocated (w=0x7ff7413f3460) at buddy.c:1909
§4 buddy_alloc_async_wait (reqs=0x7ff7413f3440) at buddy.c:2090
§5 0x00007ff75ea90aba in fellow_cache_obj_new (fc=fc@entry=0x7ff75e6d9900, dsk_sz=dsk_sz@entry=4096, nseg_guess=nseg_guess@entry=64,
fbo_mem=fbo_mem@entry=0x0, dowry=0x0, pri=<optimized out>) at fellow_cache.c:3476
§6 0x00007ff75eaa69ad in fellow_cache_obj_prepread (crit=0, fdba=..., fc=0x7ff75e6d9900) at fellow_cache.c:7353
§7 fellow_cache_obj_get (fc=0x7ff75e6d9900, ocp=ocp@entry=0x7ff7413f3840, priv2=priv2@entry=2413341433857, crit=crit@entry=1)
at fellow_cache.c:7492
§8 0x00007ff75ea4c09d in stvfe_dskoc_fco (stv=0x7ff75e64c8c0, stvfe=stvfe@entry=0x7ff75e63e000, oc=0x7fe783ad3440, crit=crit@entry=1,
wrk=<optimized out>, wrk=<optimized out>) at fellow_storage.c:446
§9 0x00007ff75ea518f6 in sfedsk_objslim (wrk=<optimized out>, dskoc=<optimized out>) at fellow_storage.c:691
§10 0x0000000000469a7e in ObjSlim (wrk=0x7ff7413fc5e8, oc=0x7fe783ad3440) at cache/cache_obj.c:374
§11 0x00000000005022c4 in LRU_NukeOne (wrk=0x7ff7413fc5e8, lru=0x7ff75e628a60) at storage/storage_lru.c:205
...
(gdb) fr 5
#5 0x00007ff75ea90aba in fellow_cache_obj_new (fc=fc@entry=0x7ff75e6d9900, dsk_sz=dsk_sz@entry=4096, nseg_guess=nseg_guess@entry=64,
fbo_mem=fbo_mem@entry=0x0, dowry=0x0, pri=<optimized out>) at fellow_cache.c:3476
3476 u = buddy_alloc_wait(reqs);
(gdb) p dsk_sz
$1 = 4096
(gdb) p mem_sz
$2 = 4096
- The highest-priority waiting memory request (wait_pri = 5) is the segmem pool allocation of a cache miss (fellow_busy_obj_alloc):
(gdb) p *reqs->buddy
$4 = {magic = 344463722, area = 0x7feac0000000 "\200*\277@\367\177",
end = 0x7ff740000000 <error: Cannot access memory at address 0x7ff740000000>, map_mtx = pthread_mutex_t = {Type = Normal,
Status = Not acquired, Robust = No, Shared = No, Protocol = None}, map = 0x7ff7450d2000, minfo_mtx = pthread_mutex_t = {
Type = Normal, Status = Not acquired, Robust = No, Shared = No, Protocol = None}, minfo_head = {rbh_root = 0x0},
rsv_avail = 49456140288, deficit = 17612800, waiting = 364, wait_working = 0, wait_pri = 5, reqs_head = {{vtqh_first = 0x0,
vtqh_last = 0x7ff75e63e0e0}, {vtqh_first = 0x0, vtqh_last = 0x7ff75e63e0f0}, {vtqh_first = 0x7ff7562c92e0,
vtqh_last = 0x7fcf58975360}, {vtqh_first = 0x7ff7413f3440, vtqh_last = 0x7ff743152340}, {vtqh_first = 0x7ff740bf2910,
vtqh_last = 0x7ff740bf2870}, {vtqh_first = 0x7ff4c532c2c0, vtqh_last = 0x7ff4c532c258}, {vtqh_first = 0x0,
vtqh_last = 0x7ff75e63e140}, {vtqh_first = 0x0, vtqh_last = 0x7ff75e63e150}, {vtqh_first = 0x0, vtqh_last = 0x7ff75e63e160}},
kick_cond = pthread_cond_t = {Threads known to still execute a wait function = 0, Clock ID = CLOCK_REALTIME, Shared = No}}
(gdb) p *reqs->buddy->reqs_head[5].vtqh_first
$6 = {magic = 1059802476, space = 1 '\001', n = 1 '\001', pri = 5 '\005', buddy = 0x7ff75e63e048, sz = 32768, map = 0x7ff7450d2000,
i_wait = {magic = 2137195452, finid = 0 '\000', next = 0 '\000', wait_mtx = pthread_mutex_t = {Type = Normal, Status = Not acquired,
Robust = No, Shared = No, Protocol = None}, wait_cond = pthread_cond_t = {Threads known to still execute a wait function = 1,
Clock ID = CLOCK_REALTIME, Shared = No}, list = {vtqe_next = 0x7ff4c532c1d8, vtqe_prev = 0x7ff75e63e130}, pri = 5 '\005',
alloced = 0 '\000', state = IW_WAITING}, i_reqalloc = 0x7ff4c532c288,
func = 0x7ff75ead6560 <__func__.14480> "fellow_busy_obj_alloc", line = 3733}