summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorSean McGivern <sean@mcgivern.me.uk>2017-08-07 14:56:15 +0000
committerSean McGivern <sean@mcgivern.me.uk>2017-08-07 14:56:15 +0000
commit5bf65c9306960cdbe9b64bf8f62dc909d1cc6440 (patch)
tree22a4b3a199fae27c64e849c6f8c805c2b5008e5c /lib
parentfa716921b8b136f5d4b09de9723568d5ac101f84 (diff)
parentfda83a6179ae63f094410942e95951b40c3ad24b (diff)
downloadgitlab-ce-5bf65c9306960cdbe9b64bf8f62dc909d1cc6440.tar.gz
Merge branch 'bvl-nfs-circuitbreaker' into 'master'
Circuitbreaker for storage paths Closes #32207, #33117, gitlab-com/infrastructure#1946, and gitlab-com/infrastructure#1775 See merge request !11449
Diffstat (limited to 'lib')
-rw-r--r--lib/api/api.rb1
-rw-r--r--lib/api/circuit_breakers.rb50
-rw-r--r--lib/api/entities.rb6
-rw-r--r--lib/gitlab/environment.rb7
-rw-r--r--lib/gitlab/git/repository.rb8
-rw-r--r--lib/gitlab/git/storage.rb22
-rw-r--r--lib/gitlab/git/storage/circuit_breaker.rb144
-rw-r--r--lib/gitlab/git/storage/forked_storage_check.rb55
-rw-r--r--lib/gitlab/git/storage/health.rb91
-rw-r--r--lib/gitlab/health_checks/fs_shards_check.rb21
-rw-r--r--lib/gitlab/prometheus/queries/additional_metrics_environment_query.rb2
-rw-r--r--lib/gitlab/prometheus/queries/environment_query.rb2
-rw-r--r--lib/gitlab/usage_data.rb4
13 files changed, 406 insertions, 7 deletions
diff --git a/lib/api/api.rb b/lib/api/api.rb
index 982a2b88d62..94df543853b 100644
--- a/lib/api/api.rb
+++ b/lib/api/api.rb
@@ -95,6 +95,7 @@ module API
mount ::API::Boards
mount ::API::Branches
mount ::API::BroadcastMessages
+ mount ::API::CircuitBreakers
mount ::API::Commits
mount ::API::CommitStatuses
mount ::API::DeployKeys
diff --git a/lib/api/circuit_breakers.rb b/lib/api/circuit_breakers.rb
new file mode 100644
index 00000000000..118883f5ea5
--- /dev/null
+++ b/lib/api/circuit_breakers.rb
@@ -0,0 +1,50 @@
+module API
+ class CircuitBreakers < Grape::API
+ before { authenticated_as_admin! }
+
+ resource :circuit_breakers do
+ params do
+ requires :type,
+ type: String,
+ desc: "The type of circuitbreaker",
+ values: ['repository_storage']
+ end
+ resource ':type' do
+ namespace '', requirements: { type: 'repository_storage' } do
+ helpers do
+ def failing_storage_health
+ @failing_storage_health ||= Gitlab::Git::Storage::Health.for_failing_storages
+ end
+
+ def storage_health
+ @failing_storage_health ||= Gitlab::Git::Storage::Health.for_all_storages
+ end
+ end
+
+ desc 'Get all failing git storages' do
+ detail 'This feature was introduced in GitLab 9.5'
+ success Entities::RepositoryStorageHealth
+ end
+ get do
+ present storage_health, with: Entities::RepositoryStorageHealth
+ end
+
+ desc 'Get all failing git storages' do
+ detail 'This feature was introduced in GitLab 9.5'
+ success Entities::RepositoryStorageHealth
+ end
+ get 'failing' do
+ present failing_storage_health, with: Entities::RepositoryStorageHealth
+ end
+
+ desc 'Reset all storage failures and open circuitbreaker' do
+ detail 'This feature was introduced in GitLab 9.5'
+ end
+ delete do
+ Gitlab::Git::Storage::CircuitBreaker.reset_all!
+ end
+ end
+ end
+ end
+ end
+end
diff --git a/lib/api/entities.rb b/lib/api/entities.rb
index 3afa44a7428..94b438db499 100644
--- a/lib/api/entities.rb
+++ b/lib/api/entities.rb
@@ -951,5 +951,11 @@ module API
expose :ip_address
expose :submitted, as: :akismet_submitted
end
+
+ class RepositoryStorageHealth < Grape::Entity
+ expose :storage_name
+ expose :failing_on_hosts
+ expose :total_failures
+ end
end
end
diff --git a/lib/gitlab/environment.rb b/lib/gitlab/environment.rb
new file mode 100644
index 00000000000..5e0dd6e7859
--- /dev/null
+++ b/lib/gitlab/environment.rb
@@ -0,0 +1,7 @@
+module Gitlab
+ module Environment
+ def self.hostname
+ @hostname ||= ENV['HOSTNAME'] || Socket.gethostname
+ end
+ end
+end
diff --git a/lib/gitlab/git/repository.rb b/lib/gitlab/git/repository.rb
index ae8e344f283..1005a819f95 100644
--- a/lib/gitlab/git/repository.rb
+++ b/lib/gitlab/git/repository.rb
@@ -59,11 +59,17 @@ module Gitlab
end
def rugged
- @rugged ||= Rugged::Repository.new(path, alternates: alternate_object_directories)
+ @rugged ||= circuit_breaker.perform do
+ Rugged::Repository.new(path, alternates: alternate_object_directories)
+ end
rescue Rugged::RepositoryError, Rugged::OSError
raise NoRepository.new('no repository for such path')
end
+ def circuit_breaker
+ @circuit_breaker ||= Gitlab::Git::Storage::CircuitBreaker.for_storage(storage)
+ end
+
# Returns an Array of branch names
# sorted by name ASC
def branch_names
diff --git a/lib/gitlab/git/storage.rb b/lib/gitlab/git/storage.rb
new file mode 100644
index 00000000000..e28be4b8a38
--- /dev/null
+++ b/lib/gitlab/git/storage.rb
@@ -0,0 +1,22 @@
+module Gitlab
+ module Git
+ module Storage
+ class Inaccessible < StandardError
+ attr_reader :retry_after
+
+ def initialize(message = nil, retry_after = nil)
+ super(message)
+ @retry_after = retry_after
+ end
+ end
+
+ CircuitOpen = Class.new(Inaccessible)
+
+ REDIS_KEY_PREFIX = 'storage_accessible:'.freeze
+
+ def self.redis
+ Gitlab::Redis::SharedState
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/git/storage/circuit_breaker.rb b/lib/gitlab/git/storage/circuit_breaker.rb
new file mode 100644
index 00000000000..9ea9367d4b7
--- /dev/null
+++ b/lib/gitlab/git/storage/circuit_breaker.rb
@@ -0,0 +1,144 @@
+module Gitlab
+ module Git
+ module Storage
+ class CircuitBreaker
+ FailureInfo = Struct.new(:last_failure, :failure_count)
+
+ attr_reader :storage,
+ :hostname,
+ :storage_path,
+ :failure_count_threshold,
+ :failure_wait_time,
+ :failure_reset_time,
+ :storage_timeout
+
+ delegate :last_failure, :failure_count, to: :failure_info
+
+ def self.reset_all!
+ pattern = "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}*"
+
+ Gitlab::Git::Storage.redis.with do |redis|
+ all_storage_keys = redis.keys(pattern)
+ redis.del(*all_storage_keys) unless all_storage_keys.empty?
+ end
+
+ RequestStore.delete(:circuitbreaker_cache)
+ end
+
+ def self.for_storage(storage)
+ cached_circuitbreakers = RequestStore.fetch(:circuitbreaker_cache) do
+ Hash.new do |hash, storage_name|
+ hash[storage_name] = new(storage_name)
+ end
+ end
+
+ cached_circuitbreakers[storage]
+ end
+
+ def initialize(storage, hostname = Gitlab::Environment.hostname)
+ @storage = storage
+ @hostname = hostname
+
+ config = Gitlab.config.repositories.storages[@storage]
+ @storage_path = config['path']
+ @failure_count_threshold = config['failure_count_threshold']
+ @failure_wait_time = config['failure_wait_time']
+ @failure_reset_time = config['failure_reset_time']
+ @storage_timeout = config['storage_timeout']
+ end
+
+ def perform
+ return yield unless Feature.enabled?('git_storage_circuit_breaker')
+
+ check_storage_accessible!
+
+ yield
+ end
+
+ def circuit_broken?
+ return false if no_failures?
+
+ recent_failure = last_failure > failure_wait_time.seconds.ago
+ too_many_failures = failure_count > failure_count_threshold
+
+ recent_failure || too_many_failures
+ end
+
+ # Memoizing the `storage_available` call means we only do it once per
+ # request when the storage is available.
+ #
+ # When the storage appears not available, and the memoized value is `false`
+ # we might want to try again.
+ def storage_available?
+ return @storage_available if @storage_available
+
+ if @storage_available = Gitlab::Git::Storage::ForkedStorageCheck
+ .storage_available?(storage_path, storage_timeout)
+ track_storage_accessible
+ else
+ track_storage_inaccessible
+ end
+
+ @storage_available
+ end
+
+ def check_storage_accessible!
+ if circuit_broken?
+ raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} is broken", failure_wait_time)
+ end
+
+ unless storage_available?
+ raise Gitlab::Git::Storage::Inaccessible.new("#{storage} not accessible", failure_wait_time)
+ end
+ end
+
+ def no_failures?
+ last_failure.blank? && failure_count == 0
+ end
+
+ def track_storage_inaccessible
+ @failure_info = FailureInfo.new(Time.now, failure_count + 1)
+
+ Gitlab::Git::Storage.redis.with do |redis|
+ redis.pipelined do
+ redis.hset(cache_key, :last_failure, last_failure.to_i)
+ redis.hincrby(cache_key, :failure_count, 1)
+ redis.expire(cache_key, failure_reset_time)
+ end
+ end
+ end
+
+ def track_storage_accessible
+ return if no_failures?
+
+ @failure_info = FailureInfo.new(nil, 0)
+
+ Gitlab::Git::Storage.redis.with do |redis|
+ redis.pipelined do
+ redis.hset(cache_key, :last_failure, nil)
+ redis.hset(cache_key, :failure_count, 0)
+ end
+ end
+ end
+
+ def failure_info
+ @failure_info ||= get_failure_info
+ end
+
+ def get_failure_info
+ last_failure, failure_count = Gitlab::Git::Storage.redis.with do |redis|
+ redis.hmget(cache_key, :last_failure, :failure_count)
+ end
+
+ last_failure = Time.at(last_failure.to_i) if last_failure.present?
+
+ FailureInfo.new(last_failure, failure_count.to_i)
+ end
+
+ def cache_key
+ @cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}"
+ end
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/git/storage/forked_storage_check.rb b/lib/gitlab/git/storage/forked_storage_check.rb
new file mode 100644
index 00000000000..91d8241f17b
--- /dev/null
+++ b/lib/gitlab/git/storage/forked_storage_check.rb
@@ -0,0 +1,55 @@
+module Gitlab
+ module Git
+ module Storage
+ module ForkedStorageCheck
+ extend self
+
+ def storage_available?(path, timeout_seconds = 5)
+ status = timeout_check(path, timeout_seconds)
+
+ status.success?
+ end
+
+ def timeout_check(path, timeout_seconds)
+ filesystem_check_pid = check_filesystem_in_process(path)
+
+ deadline = timeout_seconds.seconds.from_now.utc
+ wait_time = 0.01
+ status = nil
+
+ while status.nil?
+ if deadline > Time.now.utc
+ sleep(wait_time)
+ _pid, status = Process.wait2(filesystem_check_pid, Process::WNOHANG)
+ else
+ Process.kill('KILL', filesystem_check_pid)
+ # Blocking wait, so we are sure the process is gone before continuing
+ _pid, status = Process.wait2(filesystem_check_pid)
+ end
+ end
+
+ status
+ end
+
+ # This will spawn a new 2 processes to do the check:
+ # The outer child (waiter) will spawn another child process (stater).
+ #
+ # The stater is the process is performing the actual filesystem check
+ # the check might hang if the filesystem is acting up.
+ # In this case we will send a `KILL` to the waiter, which will still
+ # be responsive while the stater is hanging.
+ def check_filesystem_in_process(path)
+ spawn('ruby', '-e', ruby_check, path, [:out, :err] => '/dev/null')
+ end
+
+ def ruby_check
+ <<~RUBY_FILESYSTEM_CHECK
+ inner_pid = fork { File.stat(ARGV.first) }
+ Process.waitpid(inner_pid)
+ exit $?.exitstatus
+ RUBY_FILESYSTEM_CHECK
+ end
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/git/storage/health.rb b/lib/gitlab/git/storage/health.rb
new file mode 100644
index 00000000000..2d723147f4f
--- /dev/null
+++ b/lib/gitlab/git/storage/health.rb
@@ -0,0 +1,91 @@
+module Gitlab
+ module Git
+ module Storage
+ class Health
+ attr_reader :storage_name, :info
+
+ def self.pattern_for_storage(storage_name)
+ "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage_name}:*"
+ end
+
+ def self.for_all_storages
+ storage_names = Gitlab.config.repositories.storages.keys
+ results_per_storage = nil
+
+ Gitlab::Git::Storage.redis.with do |redis|
+ keys_per_storage = all_keys_for_storages(storage_names, redis)
+ results_per_storage = load_for_keys(keys_per_storage, redis)
+ end
+
+ results_per_storage.map do |name, info|
+ info.each { |i| i[:failure_count] = i[:failure_count].value.to_i }
+ new(name, info)
+ end
+ end
+
+ def self.all_keys_for_storages(storage_names, redis)
+ keys_per_storage = {}
+
+ redis.pipelined do
+ storage_names.each do |storage_name|
+ pattern = pattern_for_storage(storage_name)
+
+ keys_per_storage[storage_name] = redis.keys(pattern)
+ end
+ end
+
+ keys_per_storage
+ end
+
+ def self.load_for_keys(keys_per_storage, redis)
+ info_for_keys = {}
+
+ redis.pipelined do
+ keys_per_storage.each do |storage_name, keys_future|
+ info_for_storage = keys_future.value.map do |key|
+ { name: key, failure_count: redis.hget(key, :failure_count) }
+ end
+
+ info_for_keys[storage_name] = info_for_storage
+ end
+ end
+
+ info_for_keys
+ end
+
+ def self.for_failing_storages
+ for_all_storages.select(&:failing?)
+ end
+
+ def initialize(storage_name, info)
+ @storage_name = storage_name
+ @info = info
+ end
+
+ def failing_info
+ @failing_info ||= info.select { |info_for_host| info_for_host[:failure_count] > 0 }
+ end
+
+ def failing?
+ failing_info.any?
+ end
+
+ def failing_on_hosts
+ @failing_on_hosts ||= failing_info.map do |info_for_host|
+ info_for_host[:name].split(':').last
+ end
+ end
+
+ def failing_circuit_breakers
+ @failing_circuit_breakers ||= failing_on_hosts.map do |hostname|
+ CircuitBreaker.new(storage_name, hostname)
+ end
+ end
+
+ def total_failures
+ @total_failures ||= failing_info.sum { |info_for_host| info_for_host[:failure_count] }
+ end
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/health_checks/fs_shards_check.rb b/lib/gitlab/health_checks/fs_shards_check.rb
index 9e91c135956..eef97f54962 100644
--- a/lib/gitlab/health_checks/fs_shards_check.rb
+++ b/lib/gitlab/health_checks/fs_shards_check.rb
@@ -10,7 +10,9 @@ module Gitlab
def readiness
repository_storages.map do |storage_name|
begin
- if !storage_stat_test(storage_name)
+ if !storage_circuitbreaker_test(storage_name)
+ HealthChecks::Result.new(false, 'circuitbreaker tripped', shard: storage_name)
+ elsif !storage_stat_test(storage_name)
HealthChecks::Result.new(false, 'cannot stat storage', shard: storage_name)
else
with_temp_file(storage_name) do |tmp_file_path|
@@ -36,7 +38,8 @@ module Gitlab
[
storage_stat_metrics(storage_name),
storage_write_metrics(storage_name),
- storage_read_metrics(storage_name)
+ storage_read_metrics(storage_name),
+ storage_circuitbreaker_metrics(storage_name)
].flatten
end
end
@@ -121,6 +124,12 @@ module Gitlab
file_contents == RANDOM_STRING
end
+ def storage_circuitbreaker_test(storage_name)
+ Gitlab::Git::Storage::CircuitBreaker.new(storage_name).perform { "OK" }
+ rescue Gitlab::Git::Storage::Inaccessible
+ nil
+ end
+
def storage_stat_metrics(storage_name)
operation_metrics(:filesystem_accessible, :filesystem_access_latency_seconds, shard: storage_name) do
with_timing { storage_stat_test(storage_name) }
@@ -143,6 +152,14 @@ module Gitlab
end
end
end
+
+ def storage_circuitbreaker_metrics(storage_name)
+ operation_metrics(:filesystem_circuitbreaker,
+ :filesystem_circuitbreaker_latency_seconds,
+ shard: storage_name) do
+ with_timing { storage_circuitbreaker_test(storage_name) }
+ end
+ end
end
end
end
diff --git a/lib/gitlab/prometheus/queries/additional_metrics_environment_query.rb b/lib/gitlab/prometheus/queries/additional_metrics_environment_query.rb
index db4708b22e4..32fe8201a8d 100644
--- a/lib/gitlab/prometheus/queries/additional_metrics_environment_query.rb
+++ b/lib/gitlab/prometheus/queries/additional_metrics_environment_query.rb
@@ -5,7 +5,7 @@ module Gitlab
include QueryAdditionalMetrics
def query(environment_id)
- Environment.find_by(id: environment_id).try do |environment|
+ ::Environment.find_by(id: environment_id).try do |environment|
query_metrics(
common_query_context(environment, timeframe_start: 8.hours.ago.to_f, timeframe_end: Time.now.to_f)
)
diff --git a/lib/gitlab/prometheus/queries/environment_query.rb b/lib/gitlab/prometheus/queries/environment_query.rb
index 66f29d95177..1d17d3cfd56 100644
--- a/lib/gitlab/prometheus/queries/environment_query.rb
+++ b/lib/gitlab/prometheus/queries/environment_query.rb
@@ -3,7 +3,7 @@ module Gitlab
module Queries
class EnvironmentQuery < BaseQuery
def query(environment_id)
- Environment.find_by(id: environment_id).try do |environment|
+ ::Environment.find_by(id: environment_id).try do |environment|
environment_slug = environment.slug
timeframe_start = 8.hours.ago.to_f
timeframe_end = Time.now.to_f
diff --git a/lib/gitlab/usage_data.rb b/lib/gitlab/usage_data.rb
index e0ac21305a5..748e0a29184 100644
--- a/lib/gitlab/usage_data.rb
+++ b/lib/gitlab/usage_data.rb
@@ -27,8 +27,8 @@ module Gitlab
ci_pipeline_schedules: ::Ci::PipelineSchedule.count,
deploy_keys: DeployKey.count,
deployments: Deployment.count,
- environments: Environment.count,
- in_review_folder: Environment.in_review_folder.count,
+ environments: ::Environment.count,
+ in_review_folder: ::Environment.in_review_folder.count,
groups: Group.count,
issues: Issue.count,
keys: Key.count,