Commit 30b4ce94 authored by Zeger-Jan van de Weg's avatar Zeger-Jan van de Weg 🌴

Remove Git circuit breaker

Was introduced in the time that GitLab still used NFS, which is not
required anymore in most cases. By removing this, the API it calls will
return empty responses. This interface has to be removed in the next
major release, expected to be 12.0.
parent 550f5574
......@@ -3,12 +3,5 @@
class Admin::HealthCheckController < Admin::ApplicationController
def show
@errors = HealthCheck::Utils.process_checks(['standard'])
@failing_storage_statuses = Gitlab::Git::Storage::Health.for_failing_storages
end
def reset_storage_health
Gitlab::Git::Storage::FailureInfo.reset_all!
redirect_to admin_health_check_path,
notice: _('Git storage health information has been reset')
end
end
......@@ -66,7 +66,7 @@ class ApplicationController < ActionController::Base
head :forbidden, retry_after: Gitlab::Auth::UniqueIpsLimiter.config.unique_ips_limit_time_window
end
rescue_from Gitlab::Git::Storage::Inaccessible, GRPC::Unavailable, Gitlab::Git::CommandError do |exception|
rescue_from GRPC::Unavailable, Gitlab::Git::CommandError do |exception|
log_exception(exception)
headers['Retry-After'] = exception.retry_after if exception.respond_to?(:retry_after)
......
# frozen_string_literal: true
class HealthController < ActionController::Base
protect_from_forgery with: :exception, except: :storage_check, prepend: true
protect_from_forgery with: :exception, prepend: true
include RequiresWhitelistedMonitoringClient
CHECKS = [
......@@ -25,15 +25,6 @@ class HealthController < ActionController::Base
render_check_results(results)
end
def storage_check
results = Gitlab::Git::Storage::Checker.check_all
render json: {
check_interval: Gitlab::CurrentSettings.current_application_settings.circuitbreaker_check_interval,
results: results
}
end
private
def render_check_results(results)
......
......@@ -108,37 +108,6 @@ module ApplicationSettingsHelper
options_for_select(options, selected)
end
def circuitbreaker_failure_count_help_text
health_link = link_to(s_('AdminHealthPageLink|health page'), admin_health_check_path)
api_link = link_to(s_('CircuitBreakerApiLink|circuitbreaker api'), help_page_path("api/repository_storage_health"))
message = _("The number of failures of after which GitLab will completely "\
"prevent access to the storage. The number of failures can be "\
"reset in the admin interface: %{link_to_health_page} or using "\
"the %{api_documentation_link}.")
message = message % { link_to_health_page: health_link, api_documentation_link: api_link }
message.html_safe
end
def circuitbreaker_access_retries_help_text
_('The number of attempts GitLab will make to access a storage.')
end
def circuitbreaker_failure_reset_time_help_text
_("The time in seconds GitLab will keep failure information. When no "\
"failures occur during this time, information about the mount is reset.")
end
def circuitbreaker_storage_timeout_help_text
_("The time in seconds GitLab will try to access storage. After this time a "\
"timeout error will be raised.")
end
def circuitbreaker_check_interval_help_text
_("The time in seconds between storage checks. When a previous check did "\
"complete yet, GitLab will skip a check.")
end
def visible_attributes
[
:admin_notification_email,
......@@ -150,11 +119,6 @@ module ApplicationSettingsHelper
:authorized_keys_enabled,
:auto_devops_enabled,
:auto_devops_domain,
:circuitbreaker_access_retries,
:circuitbreaker_check_interval,
:circuitbreaker_failure_count_threshold,
:circuitbreaker_failure_reset_time,
:circuitbreaker_storage_timeout,
:clientside_sentry_dsn,
:clientside_sentry_enabled,
:container_registry_token_expire_delay,
......
# frozen_string_literal: true
module StorageHealthHelper
def failing_storage_health_message(storage_health)
storage_name = content_tag(:strong, h(storage_health.storage_name))
host_names = h(storage_health.failing_on_hosts.to_sentence)
translation_params = { storage_name: storage_name,
host_names: host_names,
failed_attempts: storage_health.total_failures }
translation = n_('%{storage_name}: failed storage access attempt on host:',
'%{storage_name}: %{failed_attempts} failed storage access attempts:',
storage_health.total_failures) % translation_params
translation.html_safe
end
def message_for_circuit_breaker(circuit_breaker)
maximum_failures = circuit_breaker.failure_count_threshold
current_failures = circuit_breaker.failure_count
translation_params = { number_of_failures: current_failures,
maximum_failures: maximum_failures }
if circuit_breaker.circuit_broken?
s_("%{number_of_failures} of %{maximum_failures} failures. GitLab will not "\
"retry automatically. Reset storage information when the problem is "\
"resolved.") % translation_params
else
_("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\
"allow access on the next attempt.") % translation_params
end
end
end
......@@ -4,6 +4,7 @@ class ApplicationSetting < ActiveRecord::Base
include CacheableAttributes
include CacheMarkdownField
include TokenAuthenticatable
include IgnorableColumn
add_authentication_token_field :runners_registration_token
add_authentication_token_field :health_check_access_token
......@@ -27,6 +28,12 @@ class ApplicationSetting < ActiveRecord::Base
serialize :domain_blacklist, Array # rubocop:disable Cop/ActiveRecordSerialize
serialize :repository_storages # rubocop:disable Cop/ActiveRecordSerialize
ignore_column :circuitbreaker_failure_count_threshold
ignore_column :circuitbreaker_failure_reset_time
ignore_column :circuitbreaker_storage_timeout
ignore_column :circuitbreaker_access_retries
ignore_column :circuitbreaker_check_interval
cache_markdown_field :sign_in_text
cache_markdown_field :help_page_text
cache_markdown_field :shared_runners_text, pipeline: :plain_markdown
......@@ -150,17 +157,6 @@ class ApplicationSetting < ActiveRecord::Base
presence: true,
numericality: { greater_than_or_equal_to: 0 }
validates :circuitbreaker_failure_count_threshold,
:circuitbreaker_failure_reset_time,
:circuitbreaker_storage_timeout,
:circuitbreaker_check_interval,
presence: true,
numericality: { only_integer: true, greater_than_or_equal_to: 0 }
validates :circuitbreaker_access_retries,
presence: true,
numericality: { only_integer: true, greater_than_or_equal_to: 1 }
validates :gitaly_timeout_default,
presence: true,
numericality: { only_integer: true, greater_than_or_equal_to: 0 }
......
......@@ -20,32 +20,5 @@
Manage repository storage paths. Learn more in the
= succeed "." do
= link_to "repository storages documentation", help_page_path("administration/repository_storage_paths")
.sub-section
%h4 Circuit breaker
.form-group
= f.label :circuitbreaker_check_interval, _('Check interval'), class: 'label-bold'
= f.number_field :circuitbreaker_check_interval, class: 'form-control'
.form-text.text-muted
= circuitbreaker_check_interval_help_text
.form-group
= f.label :circuitbreaker_access_retries, _('Number of access attempts'), class: 'label-bold'
= f.number_field :circuitbreaker_access_retries, class: 'form-control'
.form-text.text-muted
= circuitbreaker_access_retries_help_text
.form-group
= f.label :circuitbreaker_storage_timeout, _('Seconds to wait for a storage access attempt'), class: 'label-bold'
= f.number_field :circuitbreaker_storage_timeout, class: 'form-control'
.form-text.text-muted
= circuitbreaker_storage_timeout_help_text
.form-group
= f.label :circuitbreaker_failure_count_threshold, _('Maximum git storage failures'), class: 'label-bold'
= f.number_field :circuitbreaker_failure_count_threshold, class: 'form-control'
.form-text.text-muted
= circuitbreaker_failure_count_help_text
.form-group
= f.label :circuitbreaker_failure_reset_time, _('Seconds before reseting failure information'), class: 'label-bold'
= f.number_field :circuitbreaker_failure_reset_time, class: 'form-control'
.form-text.text-muted
= circuitbreaker_failure_reset_time_help_text
= f.submit 'Save changes', class: "btn btn-success qa-save-changes-button"
......@@ -20,7 +20,7 @@
%button.btn.btn-default.js-settings-toggle{ type: 'button' }
= expanded_by_default? ? _('Collapse') : _('Expand')
%p
= _('Configure storage path and circuit breaker settings.')
= _('Configure storage path settings.')
.settings-content
= render 'repository_storage'
......
- if failing_storages.any?
= _('There are problems accessing Git storage: ')
%ul
- failing_storages.each do |storage_health|
%li
= failing_storage_health_message(storage_health)
%ul
- storage_health.failing_circuit_breakers.each do |circuit_breaker|
%li
#{circuit_breaker.hostname}: #{message_for_circuit_breaker(circuit_breaker)}
= _("Access to failing storages has been temporarily disabled to allow the mount to recover. Reset storage information after the issue has been resolved to allow access again.")
.prepend-top-10
= button_to _("Reset git storage health information"), reset_storage_health_admin_health_check_path,
method: :post, class: 'btn btn-default'
- @no_container = true
- page_title _('Health Check')
- no_errors = @errors.blank? && @failing_storage_statuses.blank?
- no_errors = @errors.blank?
%div{ class: container_class }
%h3.page-title= page_title
......@@ -39,4 +39,3 @@
#{ s_('HealthCheck|No Health Problems Detected') }
- else
= @errors
= render partial: 'failing_storages', object: @failing_storage_statuses
#!/usr/bin/env ruby
require 'optparse'
require 'net/http'
require 'json'
require 'socket'
require 'logger'
require_relative '../lib/gitlab/storage_check'
Gitlab::StorageCheck::CLI.start!(ARGV)
---
title: Remove Git circuit breaker
merge_request: 22212
author:
type: removed
......@@ -56,7 +56,6 @@ Rails.application.routes.draw do
# '/-/health' implemented by BasicHealthMiddleware
get 'liveness' => 'health#liveness'
get 'readiness' => 'health#readiness'
post 'storage_check' => 'health#storage_check'
resources :metrics, only: [:index]
mount Peek::Railtie => '/peek', as: 'peek_routes'
......
......@@ -69,9 +69,7 @@ namespace :admin do
end
resource :logs, only: [:show]
resource :health_check, controller: 'health_check', only: [:show] do
post :reset_storage_health
end
resource :health_check, controller: 'health_check', only: [:show]
resource :background_jobs, controller: 'background_jobs', only: [:show]
resource :system_info, controller: 'system_info', only: [:show]
resources :requests_profiles, only: [:index, :show], param: :name, constraints: { name: /.+\.html/ }
......
# frozen_string_literal: true
class RemoveCircuitBreaker < ActiveRecord::Migration
include Gitlab::Database::MigrationHelpers
# Set this constant to true if this migration requires downtime.
DOWNTIME = false
disable_ddl_transaction!
CIRCUIT_BREAKER_COLUMS_WITH_DEFAULT = {
circuitbreaker_failure_count_threshold: 3,
circuitbreaker_failure_reset_time: 1800,
circuitbreaker_storage_timeout: 15,
circuitbreaker_access_retries: 3,
circuitbreaker_check_interval: 1
}.freeze
def up
CIRCUIT_BREAKER_COLUMS_WITH_DEFAULT.keys.each do |column|
remove_column(:application_settings, column) if column_exists?(:application_settings, column)
end
end
def down
CIRCUIT_BREAKER_COLUMS_WITH_DEFAULT.each do |column, default|
add_column_with_default(:application_settings, column, :integer, default: default) unless column_exists?(:application_settings, column)
end
end
end
......@@ -11,7 +11,7 @@
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 20181008145359) do
ActiveRecord::Schema.define(version: 20181008200441) do
# These are extensions that must be enabled in order to support this database
enable_extension "plpgsql"
......@@ -139,10 +139,6 @@ ActiveRecord::Schema.define(version: 20181008145359) do
t.boolean "hashed_storage_enabled", default: false, null: false
t.boolean "project_export_enabled", default: true, null: false
t.boolean "auto_devops_enabled", default: true, null: false
t.integer "circuitbreaker_failure_count_threshold", default: 3
t.integer "circuitbreaker_failure_reset_time", default: 1800
t.integer "circuitbreaker_storage_timeout", default: 15
t.integer "circuitbreaker_access_retries", default: 3
t.boolean "throttle_unauthenticated_enabled", default: false, null: false
t.integer "throttle_unauthenticated_requests_per_period", default: 3600, null: false
t.integer "throttle_unauthenticated_period_in_seconds", default: 3600, null: false
......@@ -152,7 +148,6 @@ ActiveRecord::Schema.define(version: 20181008145359) do
t.boolean "throttle_authenticated_web_enabled", default: false, null: false
t.integer "throttle_authenticated_web_requests_per_period", default: 7200, null: false
t.integer "throttle_authenticated_web_period_in_seconds", default: 3600, null: false
t.integer "circuitbreaker_check_interval", default: 1, null: false
t.boolean "password_authentication_enabled_for_web"
t.boolean "password_authentication_enabled_for_git", default: true
t.integer "gitaly_timeout_default", default: 55, null: false
......
......@@ -45,9 +45,6 @@ The following metrics are available:
| redis_ping_success | Gauge | 9.4 | Whether or not the last redis ping succeeded |
| redis_ping_latency_seconds | Gauge | 9.4 | Round trip time of the redis ping |
| user_session_logins_total | Counter | 9.4 | Counter of how many users have logged in |
| filesystem_circuitbreaker_latency_seconds | Gauge | 9.5 | Time spent validating if a storage is accessible |
| filesystem_circuitbreaker | Gauge | 9.5 | Whether or not the circuit for a certain shard is broken or not |
| circuitbreaker_storage_check_duration_seconds | Histogram | 10.3 | Time a single storage probe took |
| failed_login_captcha_total | Gauge | 11.0 | Counter of failed CAPTCHA attempts during login |
| successful_login_captcha_total | Gauge | 11.0 | Counter of successful CAPTCHA attempts during login |
......
......@@ -97,55 +97,6 @@ be stored via the **Application Settings** in the Admin area.
Beginning with GitLab 8.13.4, multiple paths can be chosen. New projects will be
randomly placed on one of the selected paths.
## Handling failing repository storage
> [Introduced][ce-11449] in GitLab 9.5.
When GitLab detects access to the repositories storage fails repeatedly, it can
gracefully prevent attempts to access the storage. This might be useful when
the repositories are stored somewhere on the network.
This can be configured from the admin interface:
![circuitbreaker configuration](img/circuitbreaker_config.png)
**Number of access attempts**: The number of attempts GitLab will make to access a
storage when probing a shard.
**Number of failures before backing off**: The number of failures after which
GitLab will start temporarily disabling access to a storage shard on a host.
**Maximum git storage failures:** The number of failures of after which GitLab will
completely prevent access to the storage. The number of failures can be reset in
the admin interface: `https://gitlab.example.com/admin/health_check` or using the
[api](../api/repository_storage_health.md) to allow access to the storage again.
**Seconds to wait after a storage failure:** When access to a storage fails. GitLab
will prevent access to the storage for the time specified here. This allows the
filesystem to recover.
**Seconds before reseting failure information:** The time in seconds GitLab will
keep failure information. When no failures occur during this time, information about the
mount is reset.
**Seconds to wait for a storage access attempt:** The time in seconds GitLab will
try to access storage. After this time a timeout error will be raised.
To enable the circuitbreaker for repository storage you can flip the feature flag from a rails console:
```
Feature.enable('git_storage_circuit_breaker')
```
Alternatively it can be enabled by setting `true` in the `GIT_STORAGE_CIRCUIT_BREAKER` environment variable.
This approach would be used when enabling the circuit breaker on a single host.
When storage failures occur, this will be visible in the admin interface like this:
![failing storage](img/failing_storage.png)
To allow access to all storages, click the `Reset git storage health information` button.
[ce-4578]: https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/4578
[restart-gitlab]: restart_gitlab.md#installations-from-source
[reconfigure-gitlab]: restart_gitlab.md#omnibus-gitlab-reconfigure
......
# Circuitbreaker API
> [Introduced][ce-11449] in GitLab 9.5.
The Circuitbreaker API is only accessible to administrators. All requests by
guests will respond with `401 Unauthorized`, and all requests by normal users
will respond with `403 Forbidden`.
## Repository Storages
### Get all storage information
Returns of all currently configured storages and their health information.
```
GET /circuit_breakers/repository_storage
```
```bash
curl --header "PRIVATE-TOKEN: 9koXpg98eAheJpvBs5tK" https://gitlab.example.com/api/v4/circuit_breakers/repository_storage
```
```json
[
{
"storage_name": "default",
"failing_on_hosts": [],
"total_failures": 0
},
{
"storage_name": "broken",
"failing_on_hosts": [
"web01", "worker01"
],
"total_failures": 1
}
]
```
### Get failing storages
This returns a list of all currently failing storages.
```
GET /circuit_breakers/repository_storage/failing
```
```bash
curl --header "PRIVATE-TOKEN: 9koXpg98eAheJpvBs5tK" https://gitlab.example.com/api/v4/circuit_breakers/repository_storage/failing
```
```json
[
{
"storage_name":"broken",
"failing_on_hosts":["web01", "worker01"],
"total_failures":2
}
]
```
## Reset failing storage information
Use this remove all failing storage information and allow access to the storage again.
```
DELETE /circuit_breakers/repository_storage
```
```bash
curl --request DELETE --header "PRIVATE-TOKEN: 9koXpg98eAheJpvBs5tK" https://gitlab.example.com/api/v4/circuit_breakers/repository_storage
```
[ce-11449]: https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/11449
NOTE: **Deprecated:**
Support of the circuit breaker is removed, as Gitaly can be configured to
to work without NFS and [communicate solely over HTTP](../administration/gitaly/index.md).
......@@ -138,11 +138,6 @@ are listed in the descriptions of the relevant settings.
| `authorized_keys_enabled` | boolean | no | By default, we write to the `authorized_keys` file to support Git over SSH without additional configuration. GitLab can be optimized to authenticate SSH keys via the database file. Only disable this if you have configured your OpenSSH server to use the AuthorizedKeysCommand. |
| `auto_devops_domain` | string | no | Specify a domain to use by default for every project's Auto Review Apps and Auto Deploy stages. |
| `auto_devops_enabled` | boolean | no | Enable Auto DevOps for projects by default. It will automatically build, test, and deploy applications based on a predefined CI/CD configuration. |
| `circuitbreaker_access_retries` | integer | no | The number of attempts GitLab will make to access a storage. |
| `circuitbreaker_check_interval` | integer | no | Number of seconds in between storage checks. |
| `circuitbreaker_failure_count_threshold` | integer | no | The number of failures after which GitLab will completely prevent access to the storage. |
| `circuitbreaker_failure_reset_time` | integer | no | Time in seconds GitLab will keep storage failure information. When no failures occur during this time, the failure information is reset. |
| `circuitbreaker_storage_timeout` | integer | no | Seconds to wait for a storage access attempt. |
| `clientside_sentry_dsn` | string | required by: `clientside_sentry_enabled` | Clientside Sentry Data Source Name. |
| `clientside_sentry_enabled` | boolean | no | (**If enabled, requires:** `clientside_sentry_dsn`) Enable Sentry error reporting for the client side. |
| `container_registry_token_expire_delay` | integer | no | Container Registry token duration in minutes. |
......
......@@ -13,37 +13,24 @@ module API
end
resource ':type' do
namespace '', requirements: { type: 'repository_storage' } do
helpers do
def failing_storage_health
@failing_storage_health ||= Gitlab::Git::Storage::Health.for_failing_storages
end
def storage_health
@storage_health ||= Gitlab::Git::Storage::Health.for_all_storages
end
end
desc 'Get all git storages' do
detail 'This feature was introduced in GitLab 9.5'
success Entities::RepositoryStorageHealth
end
get do
present storage_health, with: Entities::RepositoryStorageHealth
present []
end
desc 'Get all failing git storages' do
detail 'This feature was introduced in GitLab 9.5'
success Entities::RepositoryStorageHealth
end
get 'failing' do
present failing_storage_health, with: Entities::RepositoryStorageHealth
present []
end
desc 'Reset all storage failures and open circuitbreaker' do
detail 'This feature was introduced in GitLab 9.5'
end
delete do
Gitlab::Git::Storage::FailureInfo.reset_all!
end
end
end
......
......@@ -1364,12 +1364,6 @@ module API
expose :submitted, as: :akismet_submitted
end
class RepositoryStorageHealth < Grape::Entity
expose :storage_name
expose :failing_on_hosts
expose :total_failures
end
class CustomAttribute < Grape::Entity
expose :key
expose :value
......
......@@ -96,10 +96,6 @@ module Gitlab
raise Gitlab::Git::CommandError.new(e.message)
end
def circuit_breaker
@circuit_breaker ||= Gitlab::Git::Storage::CircuitBreaker.for_storage(storage)
end
def exists?
gitaly_repository_client.exists?
end
......
module Gitlab
module Git
module Storage
class Inaccessible < StandardError
attr_reader :retry_after
def initialize(message = nil, retry_after = nil)
super(message)
@retry_after = retry_after
end
end
CircuitOpen = Class.new(Inaccessible)
Misconfiguration = Class.new(Inaccessible)
Failing = Class.new(Inaccessible)
REDIS_KEY_PREFIX = 'storage_accessible:'.freeze
REDIS_KNOWN_KEYS = "#{REDIS_KEY_PREFIX}known_keys_set".freeze
def self.redis
Gitlab::Redis::SharedState
end
end
end
end
module Gitlab
module Git
module Storage
class Checker
include CircuitBreakerSettings
attr_reader :storage_path, :storage, :hostname, :logger
METRICS_MUTEX = Mutex.new
STORAGE_TIMING_BUCKETS = [0.1, 0.15, 0.25, 0.33, 0.5, 1, 1.5, 2.5, 5, 10, 15].freeze
def self.check_all(logger = Rails.logger)
threads = Gitlab.config.repositories.storages.keys.map do |storage_name|
Thread.new do
Thread.current[:result] = new(storage_name, logger).check_with_lease
end
end
threads.map do |thread|
thread.join
thread[:result]
end
end
def self.check_histogram
@check_histogram ||=
METRICS_MUTEX.synchronize do
@check_histogram || Gitlab::Metrics.histogram(:circuitbreaker_storage_check_duration_seconds,
'Storage check time in seconds',
{},
STORAGE_TIMING_BUCKETS
)
end
end
def initialize(storage, logger = Rails.logger)
@storage = storage
config = Gitlab.config.repositories.storages[@storage]
@storage_path = Gitlab::GitalyClient::StorageSettings.allow_disk_access { config.legacy_disk_path }
@logger = logger
@hostname = Gitlab::Environment.hostname
end
def check_with_lease
lease_key = "storage_check:#{cache_key}"
lease = Gitlab::ExclusiveLease.new(lease_key, timeout: storage_timeout)
result = { storage: storage, success: nil }
if uuid = lease.try_obtain
result[:success] = check
Gitlab::ExclusiveLease.cancel(lease_key, uuid)
else
logger.warn("#{hostname}: #{storage}: Skipping check, previous check still running")
end
result
end
def check
if perform_access_check
track_storage_accessible
true
else
track_storage_inaccessible
logger.error("#{hostname}: #{storage}: Not accessible.")
false
end
end
private
def perform_access_check
start_time = Gitlab::Metrics::System.monotonic_time
Gitlab::Git::Storage::ForkedStorageCheck.storage_available?(storage_path, storage_timeout, access_retries)
ensure
execution_time = Gitlab::Metrics::System.monotonic_time - start_time
self.class.check_histogram.observe({ storage: storage }, execution_time)
end
def track_storage_inaccessible
first_failure = current_failure_info.first_failure || Time.now
last_failure = Time.now
Gitlab::Git::Storage.redis.with do |redis|
redis.pipelined do
redis.hset(cache_key, :first_failure, first_failure.to_i)
redis.hset(cache_key, :last_failure, last_failure.to_i)
redis.hincrby(cache_key, :failure_count, 1)
redis.expire(cache_key, failure_reset_time)
maintain_known_keys(redis)
end
end
end
def track_storage_accessible
Gitlab::Git::Storage.redis.with do |redis|