Add alerting for Sidekiq "stopping" state
After a deploy, we often see Sidekiq stuck in the "stopping" state with no new jobs being handled. We should be notified if a Sidekiq process doesn't exit this state within some time (e.g. 10 minutes).
I ran gdb on one of these stuck Sidekiq processes and saw this:
(gdb) thread apply all bt
Thread 4 (Thread 0x7f6306df8700 (LWP 8481)):
#0 pthread_cond_timedwait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S:225
#1 0x00007f6308796060 in native_cond_timedwait (ts=<optimized out>, mutex=<optimized out>, cond=<optimized out>) at thread_pthread.c:359
#2 native_sleep (th=0x7f62e8e37800, timeout_tv=0x7f6306df6a30) at thread_pthread.c:1144
#3 0x00007f630879c56e in sleep_timeval (spurious_check=1, tv=..., th=0x7f62e8e37800) at thread.c:1111
#4 rb_thread_wait_for (time=...) at thread.c:1172
#5 0x00007f63086a9a30 in rb_f_sleep (argc=1, argv=0x7f62e2e0d510) at process.c:4382
#6 0x00007f6308775250 in vm_call_cfunc_with_frame (ci=<optimized out>, cc=<optimized out>, calling=<optimized out>, reg_cfp=0x7f62e2f0d340, th=0x7f62e8e37800) at vm_insnhelper.c:1638
#7 vm_call_cfunc (th=0x7f62e8e37800, reg_cfp=0x7f62e2f0d340, calling=<optimized out>, ci=<optimized out>, cc=<optimized out>) at vm_insnhelper.c:1733
#8 0x00007f630877e628 in vm_exec_core (th=th@entry=0x7f62e8e37800, initial=initial@entry=0) at insns.def:995
#9 0x00007f6308783186 in vm_exec (th=th@entry=0x7f62e8e37800) at vm.c:1650
#10 0x00007f63087869ea in vm_call0_body (th=0x7f62e8e37800, calling=calling@entry=0x7f6306df7200, ci=ci@entry=0x7f6306df71f0, cc=cc@entry=0x7f6306df7220, argv=0x7f62f4b72098, argv@entry=0x7f62eedf12d0) at vm_eval.c:182
#11 0x00007f6308786ee0 in vm_call0 (me=<optimized out>, argv=argv@entry=0x7f62eedf12d0, argc=argc@entry=32611, id=<optimized out>, recv=<optimized out>, th=<optimized out>) at vm_eval.c:61
#12 rb_vm_call (th=<optimized out>, recv=<optimized out>, id=<optimized out>, argc=argc@entry=0, argv=argv@entry=0x7f62f4b72098, me=<optimized out>) at vm_eval.c:264
#13 0x00007f630862f84d in rb_method_call_with_block (argc=argc@entry=0, argv=0x7f62f4b72098, method=method@entry=140062991079880, pass_procval=pass_procval@entry=8) at proc.c:1990
#14 0x00007f630862f9f5 in bmcall (args=140062989164680, method=140062991079880, argc=0, argv=<optimized out>, passed_proc=8) at proc.c:2509
#15 0x00007f6308779b94 in vm_yield_with_cfunc (th=th@entry=0x7f62e8e37800, block=block@entry=0x7f62f97df010, self=140062991079880, argc=0, argv=0x7f62e2e0d4c8, blockargptr=blockargptr@entry=0x0) at vm_insnhelper.c:2334
#16 0x00007f630877e8df in vm_invoke_block (ci=<optimized out>, calling=0x7f6306df7530, reg_cfp=0x7f62e2f0d3c0, th=0x7f62e8e37800) at vm_insnhelper.c:2463
#17 vm_exec_core (th=th@entry=0x7f62e8e37800, initial=initial@entry=0) at insns.def:1034
#18 0x00007f6308783186 in vm_exec (th=th@entry=0x7f62e8e37800) at vm.c:1650
#19 0x00007f6308783e2e in invoke_block (block=0x7f62f97df280, opt_pc=<optimized out>, type=<optimized out>, cref=0x0, self=140062990747520, iseq=0x7f6301eac980, th=0x7f62e8e37800) at vm.c:921
#20 invoke_block_from_c_0 (th=0x7f62e8e37800, block=0x7f62f97df280, self=140062990747520, argc=<optimized out>, argv=<optimized out>, blockptr=<optimized out>, cref=0x0, splattable=0) at vm.c:971
#21 0x00007f6308783f03 in invoke_block_from_c_unsplattable (cref=0x0, blockptr=0x0, argv=0x7f62f4d45848, argc=0, self=140062990747520, block=0x7f62f97df280, th=0x7f62e8e37800) at vm.c:996
#22 vm_invoke_proc (th=th@entry=0x7f62e8e37800, proc=proc@entry=0x7f62f97df280, self=140062990747520, argc=0, argv=0x7f62f4d45848, blockptr=blockptr@entry=0x0) at vm.c:1044
#23 0x00007f6308783f98 in rb_vm_invoke_proc (th=th@entry=0x7f62e8e37800, proc=proc@entry=0x7f62f97df280, argc=<optimized out>, argv=<optimized out>, blockptr=blockptr@entry=0x0) at vm.c:1072
#24 0x00007f6308799c71 in thread_start_func_2 (th=th@entry=0x7f62e8e37800, stack_start=<optimized out>) at thread.c:584
#25 0x00007f630879a230 in thread_start_func_1 (th_ptr=0x7f62e8e37800) at thread_pthread.c:882
#26 0x00007f6307fc270a in start_thread (arg=0x7f6306df8700) at pthread_create.c:333
#27 0x00007f63082de82d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109
Thread 3 (Thread 0x7f6308ee0700 (LWP 6096)):
#0 pthread_cond_timedwait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S:225
#1 0x00007f6308796060 in native_cond_timedwait (ts=<optimized out>, mutex=<optimized out>, cond=<optimized out>) at thread_pthread.c:359
#2 native_sleep (th=0x7f62f860f800, timeout_tv=0x7f6308ede9c0) at thread_pthread.c:1144
#3 0x00007f630879c56e in sleep_timeval (spurious_check=1, tv=..., th=0x7f62f860f800) at thread.c:1111
#4 rb_thread_wait_for (time=...) at thread.c:1172
#5 0x00007f63086a9a30 in rb_f_sleep (argc=1, argv=0x7f62e9e0d090) at process.c:4382
#6 0x00007f6308775250 in vm_call_cfunc_with_frame (ci=<optimized out>, cc=<optimized out>, calling=<optimized out>, reg_cfp=0x7f62e9f0cf40, th=0x7f62f860f800) at vm_insnhelper.c:1638
#7 vm_call_cfunc (th=0x7f62f860f800, reg_cfp=0x7f62e9f0cf40, calling=<optimized out>, ci=<optimized out>, cc=<optimized out>) at vm_insnhelper.c:1733
#8 0x00007f630877e628 in vm_exec_core (th=th@entry=0x7f62f860f800, initial=initial@entry=0) at insns.def:995
#9 0x00007f6308783186 in vm_exec (th=th@entry=0x7f62f860f800) at vm.c:1650
#10 0x00007f6308788a02 in invoke_block (block=<optimized out>, opt_pc=<optimized out>, type=<optimized out>, cref=0x0, self=<optimized out>, iseq=<optimized out>, th=<optimized out>) at vm.c:921
#11 invoke_block_from_c_0 (splattable=1, cref=0x0, blockptr=0x0, argv=0x0, argc=0, self=<optimized out>, block=<optimized out>, th=<optimized out>) at vm.c:971
#12 invoke_block_from_c_splattable (cref=0x0, blockptr=0x0, argv=0x0, argc=0, self=<optimized out>, block=<optimized out>, th=<optimized out>) at vm.c:988
#13 vm_yield (argc=0, argv=0x0, th=<optimized out>) at vm.c:1023
#14 rb_yield_0 (argv=0x0, argc=0) at vm_eval.c:1010
#15 loop_i () at vm_eval.c:1087
#16 0x00007f6308626c2e in rb_rescue2 (b_proc=0x7f6308788720 <loop_i>, data1=0, r_proc=0x7f6308770a00 <loop_stop>, data2=0) at eval.c:802
---Type <return> to continue, or q <return> to quit---
#17 0x00007f6308775250 in vm_call_cfunc_with_frame (ci=<optimized out>, cc=<optimized out>, calling=<optimized out>, reg_cfp=0x7f62e9f0cfc0, th=0x7f62f860f800) at vm_insnhelper.c:1638
#18 vm_call_cfunc (th=0x7f62f860f800, reg_cfp=0x7f62e9f0cfc0, calling=<optimized out>, ci=<optimized out>, cc=<optimized out>) at vm_insnhelper.c:1733
#19 0x00007f6308784bfe in vm_call_method_each_type (th=th@entry=0x7f62f860f800, cfp=cfp@entry=0x7f62e9f0cfc0, calling=0x7f6308edf4d0, ci=<optimized out>, cc=<optimized out>) at vm_insnhelper.c:2022
#20 0x00007f6308785173 in vm_call_method (th=0x7f62f860f800, cfp=0x7f62e9f0cfc0, calling=<optimized out>, ci=<optimized out>, cc=<optimized out>) at vm_insnhelper.c:2172
#21 0x00007f630877e55b in vm_exec_core (th=th@entry=0x7f62f860f800, initial=initial@entry=0) at insns.def:964
#22 0x00007f6308783186 in vm_exec (th=th@entry=0x7f62f860f800) at vm.c:1650
#23 0x00007f6308783e2e in invoke_block (block=0x7f62f8045150, opt_pc=<optimized out>, type=<optimized out>, cref=0x0, self=140062986306720, iseq=0x7f62f48b9f08, th=0x7f62f860f800) at vm.c:921
#24 invoke_block_from_c_0 (th=0x7f62f860f800, block=0x7f62f8045150, self=140062986306720, argc=<optimized out>, argv=<optimized out>, blockptr=<optimized out>, cref=0x0, splattable=0) at vm.c:971
#25 0x00007f6308783f03 in invoke_block_from_c_unsplattable (cref=0x0, blockptr=0x0, argv=0x7f62f48d50b0, argc=0, self=140062986306720, block=0x7f62f8045150, th=0x7f62f860f800) at vm.c:996
#26 vm_invoke_proc (th=th@entry=0x7f62f860f800, proc=proc@entry=0x7f62f8045150, self=140062986306720, argc=0, argv=0x7f62f48d50b0, blockptr=blockptr@entry=0x0) at vm.c:1044
#27 0x00007f6308783f98 in rb_vm_invoke_proc (th=th@entry=0x7f62f860f800, proc=proc@entry=0x7f62f8045150, argc=<optimized out>, argv=<optimized out>, blockptr=blockptr@entry=0x0) at vm.c:1072
#28 0x00007f6308799c71 in thread_start_func_2 (th=th@entry=0x7f62f860f800, stack_start=<optimized out>) at thread.c:584
#29 0x00007f630879a230 in thread_start_func_1 (th_ptr=0x7f62f860f800) at thread_pthread.c:882
#30 0x00007f6307fc270a in start_thread (arg=0x7f6308ee0700) at pthread_create.c:333
#31 0x00007f63082de82d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109
Thread 2 (Thread 0x7f6308eef700 (LWP 5010)):
#0 0x00007f63082d2b5d in poll () at ../sysdeps/unix/syscall-template.S:84
#1 0x00007f630879500a in timer_thread_sleep (gvl=0x7f6307420008) at thread_pthread.c:1455
#2 thread_timer (p=0x7f6307420008) at thread_pthread.c:1563
#3 0x00007f6307fc270a in start_thread (arg=0x7f6308eef700) at pthread_create.c:333
#4 0x00007f63082de82d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109
Thread 1 (Thread 0x7f6308ee1780 (LWP 5004)):
#0 0x00007f63082d49e3 in select () at ../sysdeps/unix/syscall-template.S:84
#1 0x00007f630879d8ec in rb_fd_select (n=<optimized out>, readfds=<optimized out>, writefds=<optimized out>, exceptfds=<optimized out>, timeout=<optimized out>) at thread.c:3418
#2 0x00007f630879dc10 in native_fd_select (th=0x7f630741c000, timeout=0x0, exceptfds=0x0, writefds=0x0, readfds=0x7ffee739bd10, n=16) at thread_pthread.c:1090
#3 do_select (timeout=<optimized out>, exceptfds=0x0, writefds=0x0, readfds=0x7ffee739bd10, n=16) at thread.c:3557
#4 rb_thread_fd_select (max=max@entry=16, read=read@entry=0x7ffee739bd10, write=write@entry=0x0, except=except@entry=0x0, timeout=timeout@entry=0x0) at thread.c:3630
#5 0x00007f63086555a7 in select_internal (fds=0x7ffee739bd10, tp=0x0, except=<optimized out>, write=<optimized out>, read=<optimized out>) at io.c:8434
#6 select_call (arg=arg@entry=140732777741552) at io.c:8504
#7 0x00007f6308627090 in rb_ensure (b_proc=b_proc@entry=0x7f63086552b0 <select_call>, data1=data1@entry=140732777741552, e_proc=e_proc@entry=0x7f6308651e90 <select_end>, data2=data2@entry=140732777741552) at eval.c:901
#8 0x00007f63086520a0 in rb_f_select (argc=<optimized out>, argv=<optimized out>, obj=<optimized out>) at io.c:8853
#9 0x00007f6308775250 in vm_call_cfunc_with_frame (ci=<optimized out>, cc=<optimized out>, calling=<optimized out>, reg_cfp=0x7f630752bc00, th=0x7f630741c000) at vm_insnhelper.c:1638
#10 vm_call_cfunc (th=0x7f630741c000, reg_cfp=0x7f630752bc00, calling=<optimized out>, ci=<optimized out>, cc=<optimized out>) at vm_insnhelper.c:1733
#11 0x00007f630877e628 in vm_exec_core (th=th@entry=0x7f630741c000, initial=initial@entry=0) at insns.def:995
#12 0x00007f6308783186 in vm_exec (th=0x7f630741c000) at vm.c:1650
#13 0x00007f63087865a8 in rb_iseq_eval (iseq=<optimized out>) at vm.c:1882
#14 0x00007f6308629cf9 in rb_load_internal0 (th=th@entry=0x7f630741c000, fname=140063213673720, wrap=<optimized out>) at load.c:619
#15 0x00007f630862a3b6 in rb_load_internal (wrap=<optimized out>, fname=<optimized out>) at load.c:649
#16 rb_f_load (argc=<optimized out>, argv=<optimized out>) at load.c:717
#17 0x00007f6308775250 in vm_call_cfunc_with_frame (ci=<optimized out>, cc=<optimized out>, calling=<optimized out>, reg_cfp=0x7f630752bcc0, th=0x7f630741c000) at vm_insnhelper.c:1638
#18 vm_call_cfunc (th=0x7f630741c000, reg_cfp=0x7f630752bcc0, calling=<optimized out>, ci=<optimized out>, cc=<optimized out>) at vm_insnhelper.c:1733
#19 0x00007f6308784bfe in vm_call_method_each_type (th=th@entry=0x7f630741c000, cfp=cfp@entry=0x7f630752bcc0, calling=0x7ffee739c750, ci=<optimized out>, cc=<optimized out>) at vm_insnhelper.c:2022
#20 0x00007f6308785173 in vm_call_method (th=0x7f630741c000, cfp=0x7f630752bcc0, calling=<optimized out>, ci=<optimized out>, cc=<optimized out>) at vm_insnhelper.c:2172
#21 0x00007f630877e628 in vm_exec_core (th=th@entry=0x7f630741c000, initial=initial@entry=0) at insns.def:995
#22 0x00007f6308783186 in vm_exec (th=0x7f630741c000) at vm.c:1650
#23 0x00007f63087865a8 in rb_iseq_eval (iseq=<optimized out>) at vm.c:1882
#24 0x00007f6308629cf9 in rb_load_internal0 (th=th@entry=0x7f630741c000, fname=140063213754080, wrap=<optimized out>) at load.c:619
---Type <return> to continue, or q <return> to quit---
#25 0x00007f630862a3b6 in rb_load_internal (wrap=<optimized out>, fname=<optimized out>) at load.c:649
#26 rb_f_load (argc=<optimized out>, argv=<optimized out>) at load.c:717
#27 0x00007f6308775250 in vm_call_cfunc_with_frame (ci=<optimized out>, cc=<optimized out>, calling=<optimized out>, reg_cfp=0x7f630752bd40, th=0x7f630741c000) at vm_insnhelper.c:1638
#28 vm_call_cfunc (th=0x7f630741c000, reg_cfp=0x7f630752bd40, calling=<optimized out>, ci=<optimized out>, cc=<optimized out>) at vm_insnhelper.c:1733
#29 0x00007f6308784bfe in vm_call_method_each_type (th=th@entry=0x7f630741c000, cfp=cfp@entry=0x7f630752bd40, calling=0x7ffee739d020, ci=<optimized out>, cc=<optimized out>) at vm_insnhelper.c:2022
#30 0x00007f6308785173 in vm_call_method (th=0x7f630741c000, cfp=0x7f630752bd40, calling=<optimized out>, ci=<optimized out>, cc=<optimized out>) at vm_insnhelper.c:2172
#31 0x00007f630877e628 in vm_exec_core (th=th@entry=0x7f630741c000, initial=initial@entry=0) at insns.def:995
#32 0x00007f6308783186 in vm_exec (th=0x7f630741c000) at vm.c:1650
#33 0x00007f63087865a8 in rb_iseq_eval (iseq=<optimized out>) at vm.c:1882
#34 0x00007f6308629cf9 in rb_load_internal0 (th=th@entry=0x7f630741c000, fname=140063291110560, wrap=<optimized out>) at load.c:619
#35 0x00007f630862a3b6 in rb_load_internal (wrap=<optimized out>, fname=<optimized out>) at load.c:649
#36 rb_f_load (argc=<optimized out>, argv=<optimized out>) at load.c:717
#37 0x00007f6308775250 in vm_call_cfunc_with_frame (ci=<optimized out>, cc=<optimized out>, calling=<optimized out>, reg_cfp=0x7f630752c080, th=0x7f630741c000) at vm_insnhelper.c:1638
#38 vm_call_cfunc (th=0x7f630741c000, reg_cfp=0x7f630752c080, calling=<optimized out>, ci=<optimized out>, cc=<optimized out>) at vm_insnhelper.c:1733
#39 0x00007f6308784bfe in vm_call_method_each_type (th=th@entry=0x7f630741c000, cfp=cfp@entry=0x7f630752c080, calling=0x7ffee739d8f0, ci=<optimized out>, cc=<optimized out>) at vm_insnhelper.c:2022
#40 0x00007f6308785173 in vm_call_method (th=0x7f630741c000, cfp=0x7f630752c080, calling=<optimized out>, ci=<optimized out>, cc=<optimized out>) at vm_insnhelper.c:2172
#41 0x00007f630877e628 in vm_exec_core (th=th@entry=0x7f630741c000, initial=initial@entry=0) at insns.def:995
#42 0x00007f6308783186 in vm_exec (th=0x7f630741c000) at vm.c:1650
#43 0x00007f630878680d in rb_iseq_eval_main (iseq=iseq@entry=0x7f63075a4080) at vm.c:1893
#44 0x00007f6308623f4d in ruby_exec_internal (n=0x7f63075a4080) at eval.c:245
#45 0x00007f630862614d in ruby_exec_node (n=n@entry=0x7f63075a4080) at eval.c:310
#46 0x00007f6308628b8e in ruby_run_node (n=0x7f63075a4080) at eval.c:302
#47 0x000000000040086b in main (argc=14, argv=0x7ffee739e0d8) at main.c:36
/cc: @ahmadsherif, @eReGeBe