Skip to content
Snippets Groups Projects
Commit e4d58c89 authored by Matthias Käppler's avatar Matthias Käppler :two: Committed by Bob Van Landuyt
Browse files

Add watchdog to observe memory fragmentation

We are adding a daemon that observes Ruby heap fragmentation.
If fragmentation exceeds a given level, a handler is invoked.

For now the handler will just swallow the event.
Eventually we hope to use this to reap workers that do not
utilize memory efficiently.

Changelog: added
parent ba070f19
No related branches found
No related tags found
1 merge request!91910Add watchdog to observe memory fragmentation
---
# Ops toggle consulted by Gitlab::Memory::Watchdog#handler. While it is off,
# the watchdog substitutes NullHandler for the configured handler, so
# violations are logged and counted but the configured handler is not invoked.
name: enforce_memory_watchdog
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/91910
rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/367534
milestone: '15.2'
type: ops
group: group::memory
default_enabled: false
---
# Ops toggle checked on every polling cycle of Gitlab::Memory::Watchdog#run_thread.
# Heap fragmentation is only measured while this flag is enabled.
name: gitlab_memory_watchdog
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/91910
rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/367534
milestone: '15.2'
type: ops
group: group::memory
default_enabled: false
# frozen_string_literal: true
# Boot the memory watchdog only when Gitlab::Runtime reports an application
# process and GITLAB_MEMORY_WATCHDOG_ENABLED parses as a true value.
return unless Gitlab::Runtime.application? && Gitlab::Utils.to_boolean(ENV['GITLAB_MEMORY_WATCHDOG_ENABLED'])

Gitlab::Cluster::LifecycleEvents.on_worker_start do
  # Choose the reaction that matches the runtime hosting this worker. Anything
  # that is neither Puma nor Sidekiq gets the no-op handler.
  fragmentation_handler =
    if Gitlab::Runtime.puma?
      Gitlab::Memory::Watchdog::PumaHandler.new
    elsif Gitlab::Runtime.sidekiq?
      Gitlab::Memory::Watchdog::TermProcessHandler.new
    else
      Gitlab::Memory::Watchdog::NullHandler.instance
    end

  watchdog = Gitlab::Memory::Watchdog.new(handler: fragmentation_handler, logger: Gitlab::AppLogger)
  watchdog.start
end
# frozen_string_literal: true
module Gitlab
module Memory
# A background thread that observes Ruby heap fragmentation and calls
# into a handler when the Ruby heap has been fragmented for an extended
# period of time.
#
# See Gitlab::Metrics::Memory for how heap fragmentation is defined.
#
# To decide whether a given fragmentation level is being exceeded,
# the watchdog regularly polls the GC. Whenever a violation occurs
# a strike is issued. Once the number of consecutive strikes exceeds the
# configured maximum, a handler is invoked to deal with the situation.
#
# The duration for which a process may be above a given fragmentation
# threshold is computed as `max_strikes * sleep_time_seconds`.
class Watchdog < Daemon
DEFAULT_SLEEP_TIME_SECONDS = 60
DEFAULT_HEAP_FRAG_THRESHOLD = 0.5
DEFAULT_MAX_STRIKES = 5
# A no-op handler. It answers `false` ("not handled"), so the watchdog keeps
# running and will call it again for as long as fragmentation stays high.
#
# Handy for "dress rehearsals" in production: it lets us observe how often
# the handler would fire before we let it take any real action.
class NullHandler
  include Singleton

  # Ignores the reported fragmentation value and always returns false.
  def on_high_heap_fragmentation(_value)
    false
  end
end
# Reacts to sustained fragmentation by sending SIGTERM to a process and then
# reporting the event as handled.
class TermProcessHandler
  # pid: the process to signal; defaults to the current process.
  def initialize(pid = Process.pid)
    @pid = pid
  end

  # Signals the target process and returns true. The fragmentation value is
  # not used.
  def on_high_heap_fragmentation(_value)
    Process.kill(:TERM, @pid)

    true
  end
end
# Delegates termination to Puma's graceful termination handler, which takes
# into account a configurable grace period during which a process may remain
# unresponsive to a SIGTERM.
class PumaHandler
  # puma_options: forwarded to the worker handle; defaults to the options of
  # the Puma CLI configuration.
  def initialize(puma_options = ::Puma.cli_config.options)
    # NOTE(review): the two zero arguments are presumably the worker index
    # and phase; confirm against the Puma version in use.
    @worker = ::Puma::Cluster::WorkerHandle.new(0, Process.pid, 0, puma_options)
  end

  # Asks the worker handle to terminate the current process and returns true.
  # The fragmentation value is not used.
  def on_high_heap_fragmentation(_value)
    @worker.term

    true
  end
end
# handler:
#   Object whose `on_high_heap_fragmentation(value)` is called once the strike
#   limit is exceeded. Defaults to the no-op NullHandler.
# logger:
#   Receives the started/stopped and limit-exceeded log events.
# max_heap_fragmentation:
#   The degree to which the Ruby heap is allowed to be fragmented. Range [0,1].
# max_strikes:
#   How many times the process is allowed to be above max_heap_fragmentation before
#   a handler is invoked.
# sleep_time_seconds:
#   Used to control the frequency with which the watchdog will wake up and poll the GC.
# options:
#   Any remaining keywords are forwarded to the superclass initializer.
#
# The three numeric settings default to GITLAB_MEMWD_MAX_HEAP_FRAG,
# GITLAB_MEMWD_MAX_STRIKES and GITLAB_MEMWD_SLEEP_TIME_SEC respectively.
# These are parsed strictly: a missing, blank or non-numeric value falls back
# to the matching DEFAULT_* constant instead of silently becoming 0, which
# would make the loop sleep for 0 seconds or treat every poll as a violation.
# The integer settings are read as base-10 so "010" means ten, not octal eight.
def initialize(
  handler: NullHandler.instance,
  logger: Logger.new($stdout),
  max_heap_fragmentation: Float(ENV['GITLAB_MEMWD_MAX_HEAP_FRAG'], exception: false) || DEFAULT_HEAP_FRAG_THRESHOLD,
  max_strikes: Integer(ENV['GITLAB_MEMWD_MAX_STRIKES'].to_s, 10, exception: false) || DEFAULT_MAX_STRIKES,
  sleep_time_seconds: Integer(ENV['GITLAB_MEMWD_SLEEP_TIME_SEC'].to_s, 10, exception: false) || DEFAULT_SLEEP_TIME_SECONDS,
  **options)
  super(**options)

  @handler = handler
  @logger = logger
  @max_heap_fragmentation = max_heap_fragmentation
  @sleep_time_seconds = sleep_time_seconds
  @max_strikes = max_strikes

  # Loop state: the polling loop runs while @alive is true; @strikes counts
  # consecutive polls above the fragmentation limit.
  @alive = true
  @strikes = 0

  init_prometheus_metrics(max_heap_fragmentation)
end
attr_reader :strikes, :max_heap_fragmentation, :max_strikes, :sleep_time_seconds
# Body of the watchdog loop: logs 'started', then sleeps and polls for as long
# as @alive is set, and logs 'stopped' once the loop exits. A cycle only
# checks heap fragmentation while the :gitlab_memory_watchdog ops flag is on.
def run_thread
  @logger.info(log_labels.merge(message: 'started'))

  while @alive
    sleep(@sleep_time_seconds)

    next unless Feature.enabled?(:gitlab_memory_watchdog, type: :ops)

    monitor_heap_fragmentation
  end

  @logger.info(log_labels.merge(message: 'stopped'))
end
private
# One polling step. Reads the current heap fragmentation, then either adds a
# strike (and bumps the violations counter) or clears the strike count. When
# the strikes exceed @max_strikes the limit-exceeded handling runs and the
# strike count starts over.
def monitor_heap_fragmentation
  current_fragmentation = Gitlab::Metrics::Memory.gc_heap_fragmentation
  over_limit = current_fragmentation > @max_heap_fragmentation

  @strikes = over_limit ? @strikes + 1 : 0
  @heap_frag_violations.increment if over_limit

  return unless @strikes > @max_strikes

  # A truthy handler result means the event was dealt with, so the watchdog
  # loop may shut down; a falsy result keeps it alive.
  @alive = !handle_heap_fragmentation_limit_exceeded(current_fragmentation)
  @strikes = 0
end
# Runs when the strike limit has been exceeded: emits a warning that carries
# the current fragmentation value, bumps the "violations handled" counter and
# passes the value to the active handler.
#
# Returns whatever the handler returns.
def handle_heap_fragmentation_limit_exceeded(value)
  payload = log_labels.merge(
    message: 'heap fragmentation limit exceeded',
    memwd_cur_heap_frag: value
  )
  @logger.warn(payload)

  @heap_frag_violations_handled.increment

  handler.on_high_heap_fragmentation(value)
end
# The handler currently in effect. The configured handler is only used while
# the :enforce_memory_watchdog ops flag is on. Otherwise the watchdog stays in
# "friendly mode": it keeps running and collecting logs and Prometheus events
# for fragmentation violations, but answers with the no-op NullHandler.
def handler
  if Feature.enabled?(:enforce_memory_watchdog, type: :ops)
    @handler
  else
    NullHandler.instance
  end
end
# Clears the @alive flag so the `while @alive` loop in #run_thread exits once
# its current iteration finishes.
# NOTE(review): presumably invoked by the Daemon superclass when the daemon is
# stopped; confirm against Daemon.
def stop_working
@alive = false
end
# Builds the structured fields attached to every watchdog log line: who is
# logging (process, worker, active handler class), how the watchdog is
# configured, and its current state (strike count and RSS).
def log_labels
  identity = {
    pid: Process.pid,
    worker_id: worker_id,
    memwd_handler_class: handler.class.name
  }
  settings = {
    memwd_sleep_time_s: @sleep_time_seconds,
    memwd_max_heap_frag: @max_heap_fragmentation,
    memwd_max_strikes: @max_strikes
  }

  identity.merge(settings, memwd_cur_strikes: @strikes, memwd_rss_bytes: process_rss_bytes)
end
# Worker identifier as reported by ::Prometheus::PidProvider. Used as the
# `worker_id` log label and as the `pid` label on the watchdog's metrics.
def worker_id
::Prometheus::PidProvider.worker_id
end
# Value of Gitlab::Metrics::System.memory_usage_rss, logged as `memwd_rss_bytes`.
def process_rss_bytes
Gitlab::Metrics::System.memory_usage_rss
end
# Registers the watchdog's three Prometheus metrics, all labelled with this
# worker's id, and stores them in instance variables:
#
# - a gauge that is immediately set to the configured fragmentation limit
# - a counter for polls that found fragmentation above the limit
# - a counter for limit-exceeded events that were passed on to a handler
def init_prometheus_metrics(max_heap_fragmentation)
  worker_labels = { pid: worker_id }

  @heap_frag_limit = Gitlab::Metrics.gauge(
    :gitlab_memwd_heap_frag_limit,
    'The configured limit for how fragmented the Ruby heap is allowed to be',
    worker_labels
  )
  @heap_frag_limit.set({}, max_heap_fragmentation)

  @heap_frag_violations = Gitlab::Metrics.counter(
    :gitlab_memwd_heap_frag_violations_total,
    'Total number of times heap fragmentation in a Ruby process exceeded its allowed maximum',
    worker_labels
  )

  @heap_frag_violations_handled = Gitlab::Metrics.counter(
    :gitlab_memwd_heap_frag_violations_handled_total,
    'Total number of times heap fragmentation violations in a Ruby process were handled',
    worker_labels
  )
end
end
end
end
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Gitlab::Memory::Watchdog, :aggregate_failures, :prometheus do
context 'watchdog' do
let(:logger) { instance_double(::Logger) }
let(:handler) { instance_double(described_class::NullHandler) }
let(:heap_frag_limit_gauge) { instance_double(::Prometheus::Client::Gauge) }
let(:heap_frag_violations_counter) { instance_double(::Prometheus::Client::Counter) }
let(:heap_frag_violations_handled_counter) { instance_double(::Prometheus::Client::Counter) }
let(:sleep_time) { 0.1 }
let(:max_heap_fragmentation) { 0.2 }
subject(:watchdog) do
described_class.new(handler: handler, logger: logger, sleep_time_seconds: sleep_time,
max_strikes: max_strikes, max_heap_fragmentation: max_heap_fragmentation)
end
before do
allow(handler).to receive(:on_high_heap_fragmentation).and_return(true)
allow(logger).to receive(:warn)
allow(logger).to receive(:info)
allow(Gitlab::Metrics::Memory).to receive(:gc_heap_fragmentation).and_return(fragmentation)
end
after do
watchdog.stop
end
# Construction-time behaviour: metric registration and how the settings are
# resolved from keyword defaults and the environment.
context 'when starting up' do
  let(:fragmentation) { 0 }
  let(:max_strikes) { 0 }

  it 'sets the heap fragmentation limit gauge' do
    allow(Gitlab::Metrics).to receive(:gauge).and_return(heap_frag_limit_gauge)

    expect(heap_frag_limit_gauge).to receive(:set).with({}, max_heap_fragmentation)

    # Build the watchdog inside the example. Without this the expectation is
    # only satisfied because the `after` hook happens to evaluate the lazy
    # subject when it calls `watchdog.stop`.
    watchdog
  end

  context 'when no settings are set in the environment' do
    it 'initializes with defaults' do
      watchdog = described_class.new(handler: handler, logger: logger)

      expect(watchdog.max_heap_fragmentation).to eq(described_class::DEFAULT_HEAP_FRAG_THRESHOLD)
      expect(watchdog.max_strikes).to eq(described_class::DEFAULT_MAX_STRIKES)
      expect(watchdog.sleep_time_seconds).to eq(described_class::DEFAULT_SLEEP_TIME_SECONDS)
    end
  end

  context 'when settings are passed through the environment' do
    before do
      # Real ENV values are always strings, so stub strings to make sure the
      # numeric parsing is what gets exercised.
      stub_env('GITLAB_MEMWD_MAX_HEAP_FRAG', '1')
      stub_env('GITLAB_MEMWD_MAX_STRIKES', '2')
      stub_env('GITLAB_MEMWD_SLEEP_TIME_SEC', '3')
    end

    it 'initializes with these settings' do
      watchdog = described_class.new(handler: handler, logger: logger)

      expect(watchdog.max_heap_fragmentation).to eq(1)
      expect(watchdog.max_strikes).to eq(2)
      expect(watchdog.sleep_time_seconds).to eq(3)
    end
  end
end
# Fragmentation is stubbed just below the limit, so no poll counts as a
# violation and the handler must stay untouched.
context 'when process does not exceed heap fragmentation threshold' do
let(:fragmentation) { max_heap_fragmentation - 0.1 }
let(:max_strikes) { 0 } # To rule out that we were granting too many strikes.
it 'does not signal the handler' do
expect(handler).not_to receive(:on_high_heap_fragmentation)
watchdog.start
# Leave the watchdog thread time for a few polling cycles before the `after`
# hook stops it.
sleep sleep_time * 3
end
end
# Fragmentation is stubbed just above the limit, so every poll is a violation.
# The nested contexts vary max_strikes and the handler's return value.
context 'when process exceeds heap fragmentation threshold permanently' do
let(:fragmentation) { max_heap_fragmentation + 0.1 }
# Swap the two counters for doubles so increments can be asserted per counter.
before do
allow(Gitlab::Metrics).to receive(:counter)
.with(:gitlab_memwd_heap_frag_violations_total, anything, anything)
.and_return(heap_frag_violations_counter)
allow(Gitlab::Metrics).to receive(:counter)
.with(:gitlab_memwd_heap_frag_violations_handled_total, anything, anything)
.and_return(heap_frag_violations_handled_counter)
allow(heap_frag_violations_counter).to receive(:increment)
allow(heap_frag_violations_handled_counter).to receive(:increment)
end
# With max_strikes at 10 the few polls that fit into each example cannot
# exceed the strike limit.
context 'when process has not exceeded allowed number of strikes' do
let(:max_strikes) { 10 }
it 'does not signal the handler' do
expect(handler).not_to receive(:on_high_heap_fragmentation)
watchdog.start
sleep sleep_time * 3
end
it 'does not log any events' do
expect(logger).not_to receive(:warn)
watchdog.start
sleep sleep_time * 3
end
it 'increments the violations counter' do
expect(heap_frag_violations_counter).to receive(:increment)
watchdog.start
sleep sleep_time * 3
end
it 'does not increment violations handled counter' do
expect(heap_frag_violations_handled_counter).not_to receive(:increment)
watchdog.start
sleep sleep_time * 3
end
end
# With max_strikes at 1 the second consecutive violating poll exceeds the limit.
context 'when process exceeds the allowed number of strikes' do
let(:max_strikes) { 1 }
it 'signals the handler and resets strike counter' do
expect(handler).to receive(:on_high_heap_fragmentation).and_return(true)
watchdog.start
sleep sleep_time * 3
expect(watchdog.strikes).to eq(0)
end
it 'logs the event' do
expect(::Prometheus::PidProvider).to receive(:worker_id).at_least(:once).and_return('worker_1')
expect(Gitlab::Metrics::System).to receive(:memory_usage_rss).at_least(:once).and_return(1024)
# The warning is emitted before the strike counter is reset, hence
# memwd_cur_strikes is max_strikes + 1.
expect(logger).to receive(:warn).with({
message: 'heap fragmentation limit exceeded',
pid: Process.pid,
worker_id: 'worker_1',
memwd_handler_class: 'RSpec::Mocks::InstanceVerifyingDouble',
memwd_sleep_time_s: sleep_time,
memwd_max_heap_frag: max_heap_fragmentation,
memwd_cur_heap_frag: fragmentation,
memwd_max_strikes: max_strikes,
memwd_cur_strikes: max_strikes + 1,
memwd_rss_bytes: 1024
})
watchdog.start
sleep sleep_time * 3
end
it 'increments both the violations and violations handled counters' do
expect(heap_frag_violations_counter).to receive(:increment)
expect(heap_frag_violations_handled_counter).to receive(:increment)
watchdog.start
sleep sleep_time * 3
end
context 'when enforce_memory_watchdog ops toggle is off' do
before do
stub_feature_flags(enforce_memory_watchdog: false)
end
it 'always uses the NullHandler' do
expect(handler).not_to receive(:on_high_heap_fragmentation)
# The NullHandler singleton is stubbed to return true here (the real one
# returns false), which makes the watchdog stop after the first event.
expect(described_class::NullHandler.instance).to(
receive(:on_high_heap_fragmentation).with(fragmentation).and_return(true)
)
watchdog.start
sleep sleep_time * 3
end
end
end
context 'when handler result is true' do
let(:max_strikes) { 1 }
it 'considers the event handled and stops itself' do
# `.once` is the actual assertion: a stopped watchdog cannot call again.
expect(handler).to receive(:on_high_heap_fragmentation).once.and_return(true)
watchdog.start
sleep sleep_time * 3
end
end
context 'when handler result is false' do
let(:max_strikes) { 1 }
it 'keeps running' do
# Return true the third time to terminate the daemon.
# NOTE(review): strikes are reset after every handler call, so each call
# needs max_strikes + 1 consecutive violating polls; three calls take roughly
# six polling intervals. Verify that the sleep below is long enough for what
# this example intends to assert.
expect(handler).to receive(:on_high_heap_fragmentation).and_return(false, false, true)
watchdog.start
sleep sleep_time * 4
end
end
end
# Readings alternate between below and above the limit, so the strike counter
# is reset before it can exceed max_strikes.
context 'when process exceeds heap fragmentation threshold temporarily' do
let(:fragmentation) { max_heap_fragmentation }
let(:max_strikes) { 1 }
before do
# Overrides the outer stub with a sequence of readings, one per poll:
# below, above, below, above.
allow(Gitlab::Metrics::Memory).to receive(:gc_heap_fragmentation).and_return(
fragmentation - 0.1,
fragmentation + 0.2,
fragmentation - 0.1,
fragmentation + 0.1
)
end
it 'does not signal the handler' do
expect(handler).not_to receive(:on_high_heap_fragmentation)
watchdog.start
sleep sleep_time * 4
end
end
# With the gitlab_memory_watchdog flag off the loop still sleeps and wakes up,
# but it must never read the fragmentation metric.
context 'when gitlab_memory_watchdog ops toggle is off' do
let(:fragmentation) { 0 }
let(:max_strikes) { 0 }
before do
stub_feature_flags(gitlab_memory_watchdog: false)
end
it 'does not monitor heap fragmentation' do
expect(Gitlab::Metrics::Memory).not_to receive(:gc_heap_fragmentation)
watchdog.start
sleep sleep_time * 3
end
end
end
# Unit examples for the three handler classes, exercised without a running
# watchdog thread.
context 'handlers' do
  context 'NullHandler' do
    subject(:handler) { described_class::NullHandler.instance }

    describe '#on_high_heap_fragmentation' do
      it 'does nothing' do
        outcome = handler.on_high_heap_fragmentation(1.0)

        expect(outcome).to be(false)
      end
    end
  end

  context 'TermProcessHandler' do
    subject(:handler) { described_class::TermProcessHandler.new(42) }

    describe '#on_high_heap_fragmentation' do
      it 'sends SIGTERM to the current process' do
        # Intercept the signal so nothing is actually sent to pid 42.
        expect(Process).to receive(:kill).with(:TERM, 42)

        outcome = handler.on_high_heap_fragmentation(1.0)

        expect(outcome).to be(true)
      end
    end
  end

  context 'PumaHandler' do
    # rubocop: disable RSpec/VerifiedDoubles
    # In tests, the Puma constant is not loaded so we cannot make this an instance_double.
    let(:puma_worker_handle_class) { double('Puma::Cluster::WorkerHandle') }
    let(:puma_worker_handle) { double('worker') }
    # rubocop: enable RSpec/VerifiedDoubles

    subject(:handler) { described_class::PumaHandler.new({}) }

    before do
      stub_const('::Puma::Cluster::WorkerHandle', puma_worker_handle_class)
    end

    describe '#on_high_heap_fragmentation' do
      it 'invokes orderly termination via Puma API' do
        # These must be set up before `handler` is first referenced, because
        # building the lazy subject is what instantiates the worker handle.
        expect(puma_worker_handle_class).to receive(:new).and_return(puma_worker_handle)
        expect(puma_worker_handle).to receive(:term)

        outcome = handler.on_high_heap_fragmentation(1.0)

        expect(outcome).to be(true)
      end
    end
  end
end
end
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment