diff options
| author | Sean McGivern <sean@mcgivern.me.uk> | 2017-08-07 14:56:15 +0000 |
|---|---|---|
| committer | Sean McGivern <sean@mcgivern.me.uk> | 2017-08-07 14:56:15 +0000 |
| commit | 5bf65c9306960cdbe9b64bf8f62dc909d1cc6440 (patch) | |
| tree | 22a4b3a199fae27c64e849c6f8c805c2b5008e5c /lib | |
| parent | fa716921b8b136f5d4b09de9723568d5ac101f84 (diff) | |
| parent | fda83a6179ae63f094410942e95951b40c3ad24b (diff) | |
| download | gitlab-ce-5bf65c9306960cdbe9b64bf8f62dc909d1cc6440.tar.gz | |
Merge branch 'bvl-nfs-circuitbreaker' into 'master'
Circuitbreaker for storage paths
Closes #32207, #33117, gitlab-com/infrastructure#1946, and gitlab-com/infrastructure#1775
See merge request !11449
Diffstat (limited to 'lib')
| -rw-r--r-- | lib/api/api.rb | 1 | ||||
| -rw-r--r-- | lib/api/circuit_breakers.rb | 50 | ||||
| -rw-r--r-- | lib/api/entities.rb | 6 | ||||
| -rw-r--r-- | lib/gitlab/environment.rb | 7 | ||||
| -rw-r--r-- | lib/gitlab/git/repository.rb | 8 | ||||
| -rw-r--r-- | lib/gitlab/git/storage.rb | 22 | ||||
| -rw-r--r-- | lib/gitlab/git/storage/circuit_breaker.rb | 144 | ||||
| -rw-r--r-- | lib/gitlab/git/storage/forked_storage_check.rb | 55 | ||||
| -rw-r--r-- | lib/gitlab/git/storage/health.rb | 91 | ||||
| -rw-r--r-- | lib/gitlab/health_checks/fs_shards_check.rb | 21 | ||||
| -rw-r--r-- | lib/gitlab/prometheus/queries/additional_metrics_environment_query.rb | 2 | ||||
| -rw-r--r-- | lib/gitlab/prometheus/queries/environment_query.rb | 2 | ||||
| -rw-r--r-- | lib/gitlab/usage_data.rb | 4 |
13 files changed, 406 insertions, 7 deletions
diff --git a/lib/api/api.rb b/lib/api/api.rb index 982a2b88d62..94df543853b 100644 --- a/lib/api/api.rb +++ b/lib/api/api.rb @@ -95,6 +95,7 @@ module API mount ::API::Boards mount ::API::Branches mount ::API::BroadcastMessages + mount ::API::CircuitBreakers mount ::API::Commits mount ::API::CommitStatuses mount ::API::DeployKeys diff --git a/lib/api/circuit_breakers.rb b/lib/api/circuit_breakers.rb new file mode 100644 index 00000000000..118883f5ea5 --- /dev/null +++ b/lib/api/circuit_breakers.rb @@ -0,0 +1,50 @@ +module API + class CircuitBreakers < Grape::API + before { authenticated_as_admin! } + + resource :circuit_breakers do + params do + requires :type, + type: String, + desc: "The type of circuitbreaker", + values: ['repository_storage'] + end + resource ':type' do + namespace '', requirements: { type: 'repository_storage' } do + helpers do + def failing_storage_health + @failing_storage_health ||= Gitlab::Git::Storage::Health.for_failing_storages + end + + def storage_health + @failing_storage_health ||= Gitlab::Git::Storage::Health.for_all_storages + end + end + + desc 'Get all failing git storages' do + detail 'This feature was introduced in GitLab 9.5' + success Entities::RepositoryStorageHealth + end + get do + present storage_health, with: Entities::RepositoryStorageHealth + end + + desc 'Get all failing git storages' do + detail 'This feature was introduced in GitLab 9.5' + success Entities::RepositoryStorageHealth + end + get 'failing' do + present failing_storage_health, with: Entities::RepositoryStorageHealth + end + + desc 'Reset all storage failures and open circuitbreaker' do + detail 'This feature was introduced in GitLab 9.5' + end + delete do + Gitlab::Git::Storage::CircuitBreaker.reset_all! + end + end + end + end + end +end diff --git a/lib/api/entities.rb b/lib/api/entities.rb index 3afa44a7428..94b438db499 100644 --- a/lib/api/entities.rb +++ b/lib/api/entities.rb @@ -951,5 +951,11 @@ module API expose :ip_address expose :submitted, as: :akismet_submitted end + + class RepositoryStorageHealth < Grape::Entity + expose :storage_name + expose :failing_on_hosts + expose :total_failures + end end end diff --git a/lib/gitlab/environment.rb b/lib/gitlab/environment.rb new file mode 100644 index 00000000000..5e0dd6e7859 --- /dev/null +++ b/lib/gitlab/environment.rb @@ -0,0 +1,7 @@ +module Gitlab + module Environment + def self.hostname + @hostname ||= ENV['HOSTNAME'] || Socket.gethostname + end + end +end diff --git a/lib/gitlab/git/repository.rb b/lib/gitlab/git/repository.rb index ae8e344f283..1005a819f95 100644 --- a/lib/gitlab/git/repository.rb +++ b/lib/gitlab/git/repository.rb @@ -59,11 +59,17 @@ module Gitlab end def rugged - @rugged ||= Rugged::Repository.new(path, alternates: alternate_object_directories) + @rugged ||= circuit_breaker.perform do + Rugged::Repository.new(path, alternates: alternate_object_directories) + end rescue Rugged::RepositoryError, Rugged::OSError raise NoRepository.new('no repository for such path') end + def circuit_breaker + @circuit_breaker ||= Gitlab::Git::Storage::CircuitBreaker.for_storage(storage) + end + # Returns an Array of branch names # sorted by name ASC def branch_names diff --git a/lib/gitlab/git/storage.rb b/lib/gitlab/git/storage.rb new file mode 100644 index 00000000000..e28be4b8a38 --- /dev/null +++ b/lib/gitlab/git/storage.rb @@ -0,0 +1,22 @@ +module Gitlab + module Git + module Storage + class Inaccessible < StandardError + attr_reader :retry_after + + def initialize(message = nil, retry_after = nil) + super(message) + @retry_after = retry_after + end + end + + CircuitOpen = Class.new(Inaccessible) + + REDIS_KEY_PREFIX = 'storage_accessible:'.freeze + + def self.redis + Gitlab::Redis::SharedState + end + end + end +end diff --git a/lib/gitlab/git/storage/circuit_breaker.rb b/lib/gitlab/git/storage/circuit_breaker.rb new file mode 100644 index 00000000000..9ea9367d4b7 --- /dev/null +++ b/lib/gitlab/git/storage/circuit_breaker.rb @@ -0,0 +1,144 @@ +module Gitlab + module Git + module Storage + class CircuitBreaker + FailureInfo = Struct.new(:last_failure, :failure_count) + + attr_reader :storage, + :hostname, + :storage_path, + :failure_count_threshold, + :failure_wait_time, + :failure_reset_time, + :storage_timeout + + delegate :last_failure, :failure_count, to: :failure_info + + def self.reset_all! + pattern = "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}*" + + Gitlab::Git::Storage.redis.with do |redis| + all_storage_keys = redis.keys(pattern) + redis.del(*all_storage_keys) unless all_storage_keys.empty? + end + + RequestStore.delete(:circuitbreaker_cache) + end + + def self.for_storage(storage) + cached_circuitbreakers = RequestStore.fetch(:circuitbreaker_cache) do + Hash.new do |hash, storage_name| + hash[storage_name] = new(storage_name) + end + end + + cached_circuitbreakers[storage] + end + + def initialize(storage, hostname = Gitlab::Environment.hostname) + @storage = storage + @hostname = hostname + + config = Gitlab.config.repositories.storages[@storage] + @storage_path = config['path'] + @failure_count_threshold = config['failure_count_threshold'] + @failure_wait_time = config['failure_wait_time'] + @failure_reset_time = config['failure_reset_time'] + @storage_timeout = config['storage_timeout'] + end + + def perform + return yield unless Feature.enabled?('git_storage_circuit_breaker') + + check_storage_accessible! + + yield + end + + def circuit_broken? + return false if no_failures? + + recent_failure = last_failure > failure_wait_time.seconds.ago + too_many_failures = failure_count > failure_count_threshold + + recent_failure || too_many_failures + end + + # Memoizing the `storage_available` call means we only do it once per + # request when the storage is available. + # + # When the storage appears not available, and the memoized value is `false` + # we might want to try again. + def storage_available? + return @storage_available if @storage_available + + if @storage_available = Gitlab::Git::Storage::ForkedStorageCheck + .storage_available?(storage_path, storage_timeout) + track_storage_accessible + else + track_storage_inaccessible + end + + @storage_available + end + + def check_storage_accessible! + if circuit_broken? + raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} is broken", failure_wait_time) + end + + unless storage_available? + raise Gitlab::Git::Storage::Inaccessible.new("#{storage} not accessible", failure_wait_time) + end + end + + def no_failures? + last_failure.blank? && failure_count == 0 + end + + def track_storage_inaccessible + @failure_info = FailureInfo.new(Time.now, failure_count + 1) + + Gitlab::Git::Storage.redis.with do |redis| + redis.pipelined do + redis.hset(cache_key, :last_failure, last_failure.to_i) + redis.hincrby(cache_key, :failure_count, 1) + redis.expire(cache_key, failure_reset_time) + end + end + end + + def track_storage_accessible + return if no_failures? + + @failure_info = FailureInfo.new(nil, 0) + + Gitlab::Git::Storage.redis.with do |redis| + redis.pipelined do + redis.hset(cache_key, :last_failure, nil) + redis.hset(cache_key, :failure_count, 0) + end + end + end + + def failure_info + @failure_info ||= get_failure_info + end + + def get_failure_info + last_failure, failure_count = Gitlab::Git::Storage.redis.with do |redis| + redis.hmget(cache_key, :last_failure, :failure_count) + end + + last_failure = Time.at(last_failure.to_i) if last_failure.present? + + FailureInfo.new(last_failure, failure_count.to_i) + end + + def cache_key + @cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}" + end + end + end + end +end diff --git a/lib/gitlab/git/storage/forked_storage_check.rb b/lib/gitlab/git/storage/forked_storage_check.rb new file mode 100644 index 00000000000..91d8241f17b --- /dev/null +++ b/lib/gitlab/git/storage/forked_storage_check.rb @@ -0,0 +1,55 @@ +module Gitlab + module Git + module Storage + module ForkedStorageCheck + extend self + + def storage_available?(path, timeout_seconds = 5) + status = timeout_check(path, timeout_seconds) + + status.success? + end + + def timeout_check(path, timeout_seconds) + filesystem_check_pid = check_filesystem_in_process(path) + + deadline = timeout_seconds.seconds.from_now.utc + wait_time = 0.01 + status = nil + + while status.nil? + if deadline > Time.now.utc + sleep(wait_time) + _pid, status = Process.wait2(filesystem_check_pid, Process::WNOHANG) + else + Process.kill('KILL', filesystem_check_pid) + # Blocking wait, so we are sure the process is gone before continuing + _pid, status = Process.wait2(filesystem_check_pid) + end + end + + status + end + + # This will spawn a new 2 processes to do the check: + # The outer child (waiter) will spawn another child process (stater). + # + # The stater is the process is performing the actual filesystem check + # the check might hang if the filesystem is acting up. + # In this case we will send a `KILL` to the waiter, which will still + # be responsive while the stater is hanging. + def check_filesystem_in_process(path) + spawn('ruby', '-e', ruby_check, path, [:out, :err] => '/dev/null') + end + + def ruby_check + <<~RUBY_FILESYSTEM_CHECK + inner_pid = fork { File.stat(ARGV.first) } + Process.waitpid(inner_pid) + exit $?.exitstatus + RUBY_FILESYSTEM_CHECK + end + end + end + end +end diff --git a/lib/gitlab/git/storage/health.rb b/lib/gitlab/git/storage/health.rb new file mode 100644 index 00000000000..2d723147f4f --- /dev/null +++ b/lib/gitlab/git/storage/health.rb @@ -0,0 +1,91 @@ +module Gitlab + module Git + module Storage + class Health + attr_reader :storage_name, :info + + def self.pattern_for_storage(storage_name) + "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage_name}:*" + end + + def self.for_all_storages + storage_names = Gitlab.config.repositories.storages.keys + results_per_storage = nil + + Gitlab::Git::Storage.redis.with do |redis| + keys_per_storage = all_keys_for_storages(storage_names, redis) + results_per_storage = load_for_keys(keys_per_storage, redis) + end + + results_per_storage.map do |name, info| + info.each { |i| i[:failure_count] = i[:failure_count].value.to_i } + new(name, info) + end + end + + def self.all_keys_for_storages(storage_names, redis) + keys_per_storage = {} + + redis.pipelined do + storage_names.each do |storage_name| + pattern = pattern_for_storage(storage_name) + + keys_per_storage[storage_name] = redis.keys(pattern) + end + end + + keys_per_storage + end + + def self.load_for_keys(keys_per_storage, redis) + info_for_keys = {} + + redis.pipelined do + keys_per_storage.each do |storage_name, keys_future| + info_for_storage = keys_future.value.map do |key| + { name: key, failure_count: redis.hget(key, :failure_count) } + end + + info_for_keys[storage_name] = info_for_storage + end + end + + info_for_keys + end + + def self.for_failing_storages + for_all_storages.select(&:failing?) + end + + def initialize(storage_name, info) + @storage_name = storage_name + @info = info + end + + def failing_info + @failing_info ||= info.select { |info_for_host| info_for_host[:failure_count] > 0 } + end + + def failing? + failing_info.any? + end + + def failing_on_hosts + @failing_on_hosts ||= failing_info.map do |info_for_host| + info_for_host[:name].split(':').last + end + end + + def failing_circuit_breakers + @failing_circuit_breakers ||= failing_on_hosts.map do |hostname| + CircuitBreaker.new(storage_name, hostname) + end + end + + def total_failures + @total_failures ||= failing_info.sum { |info_for_host| info_for_host[:failure_count] } + end + end + end + end +end diff --git a/lib/gitlab/health_checks/fs_shards_check.rb b/lib/gitlab/health_checks/fs_shards_check.rb index 9e91c135956..eef97f54962 100644 --- a/lib/gitlab/health_checks/fs_shards_check.rb +++ b/lib/gitlab/health_checks/fs_shards_check.rb @@ -10,7 +10,9 @@ module Gitlab def readiness repository_storages.map do |storage_name| begin - if !storage_stat_test(storage_name) + if !storage_circuitbreaker_test(storage_name) + HealthChecks::Result.new(false, 'circuitbreaker tripped', shard: storage_name) + elsif !storage_stat_test(storage_name) HealthChecks::Result.new(false, 'cannot stat storage', shard: storage_name) else with_temp_file(storage_name) do |tmp_file_path| @@ -36,7 +38,8 @@ module Gitlab [ storage_stat_metrics(storage_name), storage_write_metrics(storage_name), - storage_read_metrics(storage_name) + storage_read_metrics(storage_name), + storage_circuitbreaker_metrics(storage_name) ].flatten end end @@ -121,6 +124,12 @@ module Gitlab file_contents == RANDOM_STRING end + def storage_circuitbreaker_test(storage_name) + Gitlab::Git::Storage::CircuitBreaker.new(storage_name).perform { "OK" } + rescue Gitlab::Git::Storage::Inaccessible + nil + end + def storage_stat_metrics(storage_name) operation_metrics(:filesystem_accessible, :filesystem_access_latency_seconds, shard: storage_name) do with_timing { storage_stat_test(storage_name) } @@ -143,6 +152,14 @@ module Gitlab end end end + + def storage_circuitbreaker_metrics(storage_name) + operation_metrics(:filesystem_circuitbreaker, + :filesystem_circuitbreaker_latency_seconds, + shard: storage_name) do + with_timing { storage_circuitbreaker_test(storage_name) } + end + end end end end diff --git a/lib/gitlab/prometheus/queries/additional_metrics_environment_query.rb b/lib/gitlab/prometheus/queries/additional_metrics_environment_query.rb index db4708b22e4..32fe8201a8d 100644 --- a/lib/gitlab/prometheus/queries/additional_metrics_environment_query.rb +++ b/lib/gitlab/prometheus/queries/additional_metrics_environment_query.rb @@ -5,7 +5,7 @@ module Gitlab include QueryAdditionalMetrics def query(environment_id) - Environment.find_by(id: environment_id).try do |environment| + ::Environment.find_by(id: environment_id).try do |environment| query_metrics( common_query_context(environment, timeframe_start: 8.hours.ago.to_f, timeframe_end: Time.now.to_f) ) diff --git a/lib/gitlab/prometheus/queries/environment_query.rb b/lib/gitlab/prometheus/queries/environment_query.rb index 66f29d95177..1d17d3cfd56 100644 --- a/lib/gitlab/prometheus/queries/environment_query.rb +++ b/lib/gitlab/prometheus/queries/environment_query.rb @@ -3,7 +3,7 @@ module Gitlab module Queries class EnvironmentQuery < BaseQuery def query(environment_id) - Environment.find_by(id: environment_id).try do |environment| + ::Environment.find_by(id: environment_id).try do |environment| environment_slug = environment.slug timeframe_start = 8.hours.ago.to_f timeframe_end = Time.now.to_f diff --git a/lib/gitlab/usage_data.rb b/lib/gitlab/usage_data.rb index e0ac21305a5..748e0a29184 100644 --- a/lib/gitlab/usage_data.rb +++ b/lib/gitlab/usage_data.rb @@ -27,8 +27,8 @@ module Gitlab ci_pipeline_schedules: ::Ci::PipelineSchedule.count, deploy_keys: DeployKey.count, deployments: Deployment.count, - environments: Environment.count, - in_review_folder: Environment.in_review_folder.count, + environments: ::Environment.count, + in_review_folder: ::Environment.in_review_folder.count, groups: Group.count, issues: Issue.count, keys: Key.count, |
