Commit 4da03e99 authored by Grzegorz Bizon's avatar Grzegorz Bizon 🔴

Merge branch 'pawel/metrics-to-prometheus-33643' into 'master'

Add all InfluxDB metrics to prometheus

See merge request gitlab-org/gitlab-ce!13891
parents c3d15fa6 5a085dc1
---
title: Add Prometheus equivalent of all InfluxDB metrics
merge_request: 13891
author:
type: changed
......@@ -535,6 +535,7 @@ Settings.webpack.dev_server['port'] ||= 3808
Settings['monitoring'] ||= Settingslogic.new({})
Settings.monitoring['ip_whitelist'] ||= ['127.0.0.1/8']
Settings.monitoring['unicorn_sampler_interval'] ||= 10
Settings.monitoring['ruby_sampler_interval'] ||= 60
Settings.monitoring['sidekiq_exporter'] ||= Settingslogic.new({})
Settings.monitoring.sidekiq_exporter['enabled'] ||= false
Settings.monitoring.sidekiq_exporter['address'] ||= 'localhost'
......
......@@ -11,7 +11,15 @@ Prometheus::Client.configure do |config|
config.multiprocess_files_dir ||= Rails.root.join('tmp/prometheus_multiproc_dir')
end
config.pid_provider = Prometheus::Client::Support::Unicorn.method(:worker_pid_provider)
config.pid_provider = -> do
wid = Prometheus::Client::Support::Unicorn.worker_id
wid = Process.pid if wid.nil?
if wid.nil?
"process_pid_#{Process.pid}"
else
"worker_id_#{wid}"
end
end
end
Sidekiq.configure_server do |config|
......@@ -19,3 +27,11 @@ Sidekiq.configure_server do |config|
Gitlab::Metrics::SidekiqMetricsExporter.instance.start
end
end
if Gitlab::Metrics.prometheus_metrics_enabled?
unless Sidekiq.server?
Gitlab::Metrics::Samplers::UnicornSampler.initialize_instance(Settings.monitoring.unicorn_sampler_interval).start
end
Gitlab::Metrics::Samplers::RubySampler.initialize_instance(Settings.monitoring.ruby_sampler_interval).start
end
......@@ -118,10 +118,6 @@ def instrument_classes(instrumentation)
end
# rubocop:enable Metrics/AbcSize
unless Sidekiq.server?
Gitlab::Metrics::UnicornSampler.initialize_instance(Settings.monitoring.unicorn_sampler_interval).start
end
Gitlab::Application.configure do |config|
# 0 should be Sentry to catch errors in this middleware
config.middleware.insert(1, Gitlab::Metrics::RequestsRackMiddleware)
......@@ -187,7 +183,7 @@ if Gitlab::Metrics.enabled?
GC::Profiler.enable
Gitlab::Metrics::InfluxSampler.initialize_instance.start
Gitlab::Metrics::Samplers::InfluxSampler.initialize_instance.start
module TrackNewRedisConnections
def connect(*args)
......
......@@ -43,7 +43,7 @@ module Gitlab
if thread
thread.wakeup if thread.alive?
thread.join
thread.join unless Thread.current == thread
@thread = nil
end
end
......
module Gitlab
module Metrics
class BackgroundTransaction < Transaction
def initialize(worker_class)
super()
@worker_class = worker_class
end
protected
def labels
{ controller: @worker_class.name, action: 'perform' }
end
end
end
end
require 'logger'
module Gitlab
module Metrics
class BaseSampler < Daemon
# interval - The sampling interval in seconds.
def initialize(interval)
interval_half = interval.to_f / 2
@interval = interval
@interval_steps = (-interval_half..interval_half).step(0.1).to_a
super()
end
def safe_sample
sample
rescue => e
Rails.logger.warn("#{self.class}: #{e}, stopping")
stop
end
def sample
raise NotImplementedError
end
# Returns the sleep interval with a random adjustment.
#
# The random adjustment is put in place to ensure we:
#
# 1. Don't generate samples at the exact same interval every time (thus
# potentially missing anything that happens in between samples).
# 2. Don't sample data at the same interval two times in a row.
def sleep_interval
while (step = @interval_steps.sample)
if step != @last_step
@last_step = step
return @interval + @last_step
end
end
end
private
attr_reader :running
def start_working
@running = true
sleep(sleep_interval)
while running
safe_sample
sleep(sleep_interval)
end
end
def stop_working
@running = false
end
end
end
end
......@@ -11,6 +11,8 @@ module Gitlab
settings[:enabled] || false
end
# Prometheus histogram buckets used for arbitrary code measurements
EXECUTION_MEASUREMENT_BUCKETS = [0.001, 0.002, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1].freeze
RAILS_ROOT = Rails.root.to_s
METRICS_ROOT = Rails.root.join('lib', 'gitlab', 'metrics').to_s
PATH_REGEX = /^#{RAILS_ROOT}\/?/
......@@ -99,24 +101,27 @@ module Gitlab
cpu_stop = System.cpu_time
real_stop = Time.now.to_f
real_time = (real_stop - real_start) * 1000.0
real_time = (real_stop - real_start)
cpu_time = cpu_stop - cpu_start
trans.increment("#{name}_real_time", real_time)
trans.increment("#{name}_cpu_time", cpu_time)
trans.increment("#{name}_call_count", 1)
Gitlab::Metrics.histogram("gitlab_#{name}_real_duration_seconds".to_sym,
"Measure #{name}",
Transaction::BASE_LABELS,
EXECUTION_MEASUREMENT_BUCKETS)
.observe(trans.labels, real_time)
retval
end
Gitlab::Metrics.histogram("gitlab_#{name}_cpu_duration_seconds".to_sym,
"Measure #{name}",
Transaction::BASE_LABELS,
EXECUTION_MEASUREMENT_BUCKETS)
.observe(trans.labels, cpu_time / 1000.0)
# Adds a tag to the current transaction (if any)
#
# name - The name of the tag to add.
# value - The value of the tag.
def tag_transaction(name, value)
trans = current_transaction
# InfluxDB stores the _real_time time values as milliseconds
trans.increment("#{name}_real_time", real_time * 1000, false)
trans.increment("#{name}_cpu_time", cpu_time, false)
trans.increment("#{name}_call_count", 1, false)
trans&.add_tag(name, value)
retval
end
# Sets the action of the current transaction (if any)
......
module Gitlab
module Metrics
# Class that sends certain metrics to InfluxDB at a specific interval.
#
# This class is used to gather statistics that can't be directly associated
# with a transaction such as system memory usage, garbage collection
# statistics, etc.
class InfluxSampler < BaseSampler
# interval - The sampling interval in seconds.
def initialize(interval = Metrics.settings[:sample_interval])
super(interval)
@last_step = nil
@metrics = []
@last_minor_gc = Delta.new(GC.stat[:minor_gc_count])
@last_major_gc = Delta.new(GC.stat[:major_gc_count])
if Gitlab::Metrics.mri?
require 'allocations'
Allocations.start
end
end
def sample
sample_memory_usage
sample_file_descriptors
sample_objects
sample_gc
flush
ensure
GC::Profiler.clear
@metrics.clear
end
def flush
Metrics.submit_metrics(@metrics.map(&:to_hash))
end
def sample_memory_usage
add_metric('memory_usage', value: System.memory_usage)
end
def sample_file_descriptors
add_metric('file_descriptors', value: System.file_descriptor_count)
end
if Metrics.mri?
def sample_objects
sample = Allocations.to_hash
counts = sample.each_with_object({}) do |(klass, count), hash|
name = klass.name
next unless name
hash[name] = count
end
# Symbols aren't allocated so we'll need to add those manually.
counts['Symbol'] = Symbol.all_symbols.length
counts.each do |name, count|
add_metric('object_counts', { count: count }, type: name)
end
end
else
def sample_objects
end
end
def sample_gc
time = GC::Profiler.total_time * 1000.0
stats = GC.stat.merge(total_time: time)
# We want the difference of GC runs compared to the last sample, not the
# total amount since the process started.
stats[:minor_gc_count] =
@last_minor_gc.compared_with(stats[:minor_gc_count])
stats[:major_gc_count] =
@last_major_gc.compared_with(stats[:major_gc_count])
stats[:count] = stats[:minor_gc_count] + stats[:major_gc_count]
add_metric('gc_statistics', stats)
end
def add_metric(series, values, tags = {})
prefix = sidekiq? ? 'sidekiq_' : 'rails_'
@metrics << Metric.new("#{prefix}#{series}", values, tags)
end
def sidekiq?
Sidekiq.server?
end
end
end
end
......@@ -118,19 +118,21 @@ module Gitlab
def self.instrument(type, mod, name)
return unless Metrics.enabled?
name = name.to_sym
name = name.to_sym
target = type == :instance ? mod : mod.singleton_class
if type == :instance
target = mod
label = "#{mod.name}##{name}"
method_name = "##{name}"
method = mod.instance_method(name)
else
target = mod.singleton_class
label = "#{mod.name}.#{name}"
method_name = ".#{name}"
method = mod.method(name)
end
label = "#{mod.name}#{method_name}"
unless instrumented?(target)
target.instance_variable_set(PROXY_IVAR, Module.new)
end
......@@ -153,7 +155,8 @@ module Gitlab
proxy_module.class_eval <<-EOF, __FILE__, __LINE__ + 1
def #{name}(#{args_signature})
if trans = Gitlab::Metrics::Instrumentation.transaction
trans.method_call_for(#{label.to_sym.inspect}).measure { super }
trans.method_call_for(#{label.to_sym.inspect}, #{mod.name.inspect}, "#{method_name}")
.measure { super }
else
super
end
......
......@@ -2,15 +2,45 @@ module Gitlab
module Metrics
# Class for tracking timing information about method calls
class MethodCall
attr_reader :real_time, :cpu_time, :call_count
MUTEX = Mutex.new
BASE_LABELS = { module: nil, method: nil }.freeze
attr_reader :real_time, :cpu_time, :call_count, :labels
def self.call_real_duration_histogram
return @call_real_duration_histogram if @call_real_duration_histogram
MUTEX.synchronize do
@call_real_duration_histogram ||= Gitlab::Metrics.histogram(
:gitlab_method_call_real_duration_seconds,
'Method calls real duration',
Transaction::BASE_LABELS.merge(BASE_LABELS),
[0.1, 0.2, 0.5, 1, 2, 5, 10]
)
end
end
def self.call_cpu_duration_histogram
return @call_cpu_duration_histogram if @call_cpu_duration_histogram
MUTEX.synchronize do
@call_duration_histogram ||= Gitlab::Metrics.histogram(
:gitlab_method_call_cpu_duration_seconds,
'Method calls cpu duration',
Transaction::BASE_LABELS.merge(BASE_LABELS),
[0.1, 0.2, 0.5, 1, 2, 5, 10]
)
end
end
# name - The full name of the method (including namespace) such as
# `User#sign_in`.
#
# series - The series to use for storing the data.
def initialize(name, series)
def initialize(name, module_name, method_name, transaction)
@module_name = module_name
@method_name = method_name
@transaction = transaction
@name = name
@series = series
@labels = { module: @module_name, method: @method_name }
@real_time = 0
@cpu_time = 0
@call_count = 0
......@@ -22,21 +52,27 @@ module Gitlab
start_cpu = System.cpu_time
retval = yield
@real_time += System.monotonic_time - start_real
@cpu_time += System.cpu_time - start_cpu
real_time = System.monotonic_time - start_real
cpu_time = System.cpu_time - start_cpu
@real_time += real_time
@cpu_time += cpu_time
@call_count += 1
self.class.call_real_duration_histogram.observe(@transaction.labels.merge(labels), real_time / 1000.0)
self.class.call_cpu_duration_histogram.observe(@transaction.labels.merge(labels), cpu_time / 1000.0)
retval
end
# Returns a Metric instance of the current method call.
def to_metric
Metric.new(
@series,
Instrumentation.series,
{
duration: real_time,
duration: real_time,
cpu_duration: cpu_time,
call_count: call_count
call_count: call_count
},
method: @name
)
......
......@@ -5,6 +5,9 @@ module Gitlab
module Prometheus
include Gitlab::CurrentSettings
REGISTRY_MUTEX = Mutex.new
PROVIDER_MUTEX = Mutex.new
def metrics_folder_present?
multiprocess_files_dir = ::Prometheus::Client.configuration.multiprocess_files_dir
......@@ -20,23 +23,38 @@ module Gitlab
end
def registry
@registry ||= ::Prometheus::Client.registry
return @registry if @registry
REGISTRY_MUTEX.synchronize do
@registry ||= ::Prometheus::Client.registry
end
end
def counter(name, docstring, base_labels = {})
provide_metric(name) || registry.counter(name, docstring, base_labels)
safe_provide_metric(:counter, name, docstring, base_labels)
end
def summary(name, docstring, base_labels = {})
provide_metric(name) || registry.summary(name, docstring, base_labels)
safe_provide_metric(:summary, name, docstring, base_labels)
end
def gauge(name, docstring, base_labels = {}, multiprocess_mode = :all)
provide_metric(name) || registry.gauge(name, docstring, base_labels, multiprocess_mode)
safe_provide_metric(:gauge, name, docstring, base_labels, multiprocess_mode)
end
def histogram(name, docstring, base_labels = {}, buckets = ::Prometheus::Client::Histogram::DEFAULT_BUCKETS)
provide_metric(name) || registry.histogram(name, docstring, base_labels, buckets)
safe_provide_metric(:histogram, name, docstring, base_labels, buckets)
end
private
def safe_provide_metric(method, name, *args)
metric = provide_metric(name)
return metric if metric
PROVIDER_MUTEX.synchronize do
provide_metric(name) || registry.method(method).call(name, *args)
end
end
def provide_metric(name)
......@@ -47,8 +65,6 @@ module Gitlab
end
end
private
def prometheus_metrics_enabled_unmemoized
metrics_folder_present? && current_application_settings[:prometheus_metrics_enabled] || false
end
......
......@@ -2,20 +2,6 @@ module Gitlab
module Metrics
# Rack middleware for tracking Rails and Grape requests.
class RackMiddleware
CONTROLLER_KEY = 'action_controller.instance'.freeze
ENDPOINT_KEY = 'api.endpoint'.freeze
CONTENT_TYPES = {
'text/html' => :html,
'text/plain' => :txt,
'application/json' => :json,
'text/js' => :js,
'application/atom+xml' => :atom,
'image/png' => :png,
'image/jpeg' => :jpeg,
'image/gif' => :gif,
'image/svg+xml' => :svg
}.freeze
def initialize(app)
@app = app
end
......@@ -35,12 +21,6 @@ module Gitlab
# Even in the event of an error we want to submit any metrics we
# might've gathered up to this point.
ensure
if env[CONTROLLER_KEY]
tag_controller(trans, env)
elsif env[ENDPOINT_KEY]
tag_endpoint(trans, env)
end
trans.finish
end
......@@ -48,60 +28,19 @@ module Gitlab
end
def transaction_from_env(env)
trans = Transaction.new
trans = WebTransaction.new(env)
trans.set(:request_uri, filtered_path(env))
trans.set(:request_method, env['REQUEST_METHOD'])
trans.set(:request_uri, filtered_path(env), false)
trans.set(:request_method, env['REQUEST_METHOD'], false)
trans
end
def tag_controller(trans, env)
controller = env[CONTROLLER_KEY]
action = "#{controller.class.name}##{controller.action_name}"
suffix = CONTENT_TYPES[controller.content_type]
if suffix && suffix != :html
action += ".#{suffix}"
end
trans.action = action
end
def tag_endpoint(trans, env)
endpoint = env[ENDPOINT_KEY]
begin
route = endpoint.route
rescue
# endpoint.route is calling env[Grape::Env::GRAPE_ROUTING_ARGS][:route_info]
# but env[Grape::Env::GRAPE_ROUTING_ARGS] is nil in the case of a 405 response
# so we're rescuing exceptions and bailing out
end
if route
path = endpoint_paths_cache[route.request_method][route.path]
trans.action = "Grape##{route.request_method} #{path}"
end
end
private
def filtered_path(env)
ActionDispatch::Request.new(env).filtered_path.presence || env['REQUEST_URI']
end
def endpoint_paths_cache
@endpoint_paths_cache ||= Hash.new do |hash, http_method|
hash[http_method] = Hash.new do |inner_hash, raw_path|
inner_hash[raw_path] = endpoint_instrumentable_path(raw_path)
end
end
end
def endpoint_instrumentable_path(raw_path)
raw_path.sub('(.:format)', '').sub('/:version', '')
end
end
end
end
require 'logger'
module Gitlab
module Metrics
module Samplers
class BaseSampler < Daemon
# interval - The sampling interval in seconds.
def initialize(interval)
interval_half = interval.to_f / 2
@interval = interval
@interval_steps = (-interval_half..interval_half).step(0.1).to_a
super()
end
def safe_sample
sample
rescue => e
Rails.logger.warn("#{self.class}: #{e}, stopping")
stop
end
def sample
raise NotImplementedError
end
# Returns the sleep interval with a random adjustment.
#
# The random adjustment is put in place to ensure we:
#
# 1. Don't generate samples at the exact same interval every time (thus
# potentially missing anything that happens in between samples).
# 2. Don't sample data at the same interval two times in a row.
def sleep_interval
while step = @interval_steps.sample
if step != @last_step
@last_step = step
return @interval + @last_step
end
end
end
private
attr_reader :running
def start_working
@running = true
sleep(sleep_interval)
while running
safe_sample
sleep(sleep_interval)
end
end
def stop_working
@running = false
end
end
end
end
end
module Gitlab
module Metrics
module Samplers
# Class that sends certain metrics to InfluxDB at a specific interval.
#
# This class is used to gather statistics that can't be directly associated
# with a transaction such as system memory usage, garbage collection
# statistics, etc.
class InfluxSampler < BaseSampler
# interval - The sampling interval in seconds.
def initialize(interval = Metrics.settings[:sample_interval])
super(interval)
@last_step = nil
@metrics = []
@last_minor_gc = Delta.new(GC.stat[:minor_gc_count])
@last_major_gc = Delta.new(GC.stat[:major_gc_count])
if Gitlab::Metrics.mri?
require 'allocations'
Allocations.start
end
end
def sample
sample_memory_usage
sample_file_descriptors
sample_objects
sample_gc
flush
ensure
GC::Profiler.clear
@metrics.clear
end
def flush
Metrics.submit_metrics(@metrics.map(&:to_hash))
end