diff options
author | Bob Van Landuyt <bob@vanlanduyt.co> | 2017-10-19 08:32:55 +0200 |
---|---|---|
committer | Bob Van Landuyt <bob@vanlanduyt.co> | 2017-10-23 12:02:23 +0300 |
commit | 430e7671397a1c022b88da31328a5a81409671b5 (patch) | |
tree | 44dfbf4deb769d418968ba1505c9d1b2fa4b92f2 | |
parent | 1881d4f8ecbf52afd7bc732cd6c1296fafd38405 (diff) | |
download | gitlab-ce-430e7671397a1c022b88da31328a5a81409671b5.tar.gz |
Implement backoff for the circuitbreaker
The circuitbreaker now has 2 failure modes:
- Backing off: This will raise the `Gitlab::Git::Storage::Failing`
exception. Access to the shard is blocked temporarily.
- Circuit broken: This will raise the
`Gitlab::Git::Storage::CircuitBroken` exception. Access to the shard
will be blocked until the failures are reset.
-rw-r--r-- | app/helpers/storage_health_helper.rb | 5 | ||||
-rw-r--r-- | lib/gitlab/git/storage.rb | 1 | ||||
-rw-r--r-- | lib/gitlab/git/storage/circuit_breaker.rb | 28 | ||||
-rw-r--r-- | lib/gitlab/git/storage/circuit_breaker_settings.rb | 8 | ||||
-rw-r--r-- | lib/gitlab/git/storage/null_circuit_breaker.rb | 4 | ||||
-rw-r--r-- | spec/lib/gitlab/git/storage/circuit_breaker_spec.rb | 259 | ||||
-rw-r--r-- | spec/lib/gitlab/git/storage/null_circuit_breaker_spec.rb | 13 |
7 files changed, 128 insertions, 190 deletions
diff --git a/app/helpers/storage_health_helper.rb b/app/helpers/storage_health_helper.rb index 544c9efb845..4d2180f7eee 100644 --- a/app/helpers/storage_health_helper.rb +++ b/app/helpers/storage_health_helper.rb @@ -16,17 +16,16 @@ module StorageHealthHelper def message_for_circuit_breaker(circuit_breaker) maximum_failures = circuit_breaker.failure_count_threshold current_failures = circuit_breaker.failure_count - permanently_broken = circuit_breaker.circuit_broken? && current_failures >= maximum_failures translation_params = { number_of_failures: current_failures, maximum_failures: maximum_failures, number_of_seconds: circuit_breaker.failure_wait_time } - if permanently_broken + if circuit_breaker.circuit_broken? s_("%{number_of_failures} of %{maximum_failures} failures. GitLab will not "\ "retry automatically. Reset storage information when the problem is "\ "resolved.") % translation_params - elsif circuit_breaker.circuit_broken? + elsif circuit_breaker.backing_off? _("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\ "block access for %{number_of_seconds} seconds.") % translation_params else diff --git a/lib/gitlab/git/storage.rb b/lib/gitlab/git/storage.rb index 08e6c29abad..99518c9b1e4 100644 --- a/lib/gitlab/git/storage.rb +++ b/lib/gitlab/git/storage.rb @@ -12,6 +12,7 @@ module Gitlab CircuitOpen = Class.new(Inaccessible) Misconfiguration = Class.new(Inaccessible) + Failing = Class.new(Inaccessible) REDIS_KEY_PREFIX = 'storage_accessible:'.freeze diff --git a/lib/gitlab/git/storage/circuit_breaker.rb b/lib/gitlab/git/storage/circuit_breaker.rb index 0456ad9a1f3..2ce97ff41f9 100644 --- a/lib/gitlab/git/storage/circuit_breaker.rb +++ b/lib/gitlab/git/storage/circuit_breaker.rb @@ -64,12 +64,20 @@ module Gitlab def circuit_broken? return false if no_failures? + failure_count > failure_count_threshold + end + + def backing_off? + return false if no_failures? + recent_failure = last_failure > failure_wait_time.seconds.ago - too_many_failures = failure_count > failure_count_threshold + too_many_failures = failure_count > backoff_threshold - recent_failure || too_many_failures + recent_failure && too_many_failures end + private + def failure_info @failure_info ||= get_failure_info end @@ -94,7 +102,11 @@ module Gitlab def check_storage_accessible! if circuit_broken? - raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} is broken", failure_wait_time) + raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} is broken", failure_reset_time) + end + + if backing_off? + raise Gitlab::Git::Storage::Failing.new("Backing off access to #{storage}", failure_wait_time) end unless storage_available? @@ -131,12 +143,6 @@ module Gitlab end end - def cache_key - @cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}" - end - - private - def get_failure_info last_failure, failure_count = Gitlab::Git::Storage.redis.with do |redis| redis.hmget(cache_key, :last_failure, :failure_count) @@ -146,6 +152,10 @@ module Gitlab FailureInfo.new(last_failure, failure_count.to_i) end + + def cache_key + @cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}" + end end end end diff --git a/lib/gitlab/git/storage/circuit_breaker_settings.rb b/lib/gitlab/git/storage/circuit_breaker_settings.rb index d2313fe7c1b..257fe8cd8f0 100644 --- a/lib/gitlab/git/storage/circuit_breaker_settings.rb +++ b/lib/gitlab/git/storage/circuit_breaker_settings.rb @@ -18,6 +18,14 @@ module Gitlab application_settings.circuitbreaker_storage_timeout end + def access_retries + application_settings.circuitbreaker_access_retries + end + + def backoff_threshold + application_settings.circuitbreaker_backoff_threshold + end + private def application_settings diff --git a/lib/gitlab/git/storage/null_circuit_breaker.rb b/lib/gitlab/git/storage/null_circuit_breaker.rb index 60c6791a7e4..a12d52d295f 100644 --- a/lib/gitlab/git/storage/null_circuit_breaker.rb +++ b/lib/gitlab/git/storage/null_circuit_breaker.rb @@ -25,6 +25,10 @@ module Gitlab !!@error end + def backing_off? + false + end + def last_failure circuit_broken? ? Time.now : nil end diff --git a/spec/lib/gitlab/git/storage/circuit_breaker_spec.rb b/spec/lib/gitlab/git/storage/circuit_breaker_spec.rb index c8d532df059..e3f221aa863 100644 --- a/spec/lib/gitlab/git/storage/circuit_breaker_spec.rb +++ b/spec/lib/gitlab/git/storage/circuit_breaker_spec.rb @@ -79,7 +79,9 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: stub_application_setting(circuitbreaker_failure_count_threshold: 0, circuitbreaker_failure_wait_time: 1, circuitbreaker_failure_reset_time: 2, - circuitbreaker_storage_timeout: 3) + circuitbreaker_storage_timeout: 3, + circuitbreaker_access_retries: 4, + circuitbreaker_backoff_threshold: 5) end describe '#failure_count_threshold' do @@ -105,14 +107,43 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: expect(circuit_breaker.storage_timeout).to eq(3) end end + + describe '#access_retries' do + it 'reads the value from settings' do + expect(circuit_breaker.access_retries).to eq(4) + end + end + + describe '#backoff_threshold' do + it 'reads the value from settings' do + expect(circuit_breaker.backoff_threshold).to eq(5) + end + end end describe '#perform' do - it 'raises an exception with retry time when the circuit is open' do - allow(circuit_breaker).to receive(:circuit_broken?).and_return(true) + it 'raises the correct exception when the circuit is open' do + set_in_redis(:last_failure, 1.day.ago.to_f) + set_in_redis(:failure_count, 999) expect { |b| circuit_breaker.perform(&b) } - .to raise_error(Gitlab::Git::Storage::CircuitOpen) + .to raise_error do |exception| + expect(exception).to be_kind_of(Gitlab::Git::Storage::CircuitOpen) + expect(exception.retry_after).to eq(1800) + end + end + + it 'raises the correct exception when backing off' do + Timecop.freeze do + set_in_redis(:last_failure, 1.second.ago.to_f) + set_in_redis(:failure_count, 90) + + expect { |b| circuit_breaker.perform(&b) } + .to raise_error do |exception| + expect(exception).to be_kind_of(Gitlab::Git::Storage::Failing) + expect(exception.retry_after).to eq(30) + end + end end it 'yields the block' do @@ -122,6 +153,7 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: it 'checks if the storage is available' do expect(circuit_breaker).to receive(:check_storage_accessible!) + .and_call_original circuit_breaker.perform { 'hello world' } end @@ -137,201 +169,102 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: .to raise_error(Rugged::OSError) end - context 'with the feature disabled' do - it 'returns the block without checking accessibility' do - stub_feature_flags(git_storage_circuit_breaker: false) - - expect(circuit_breaker).not_to receive(:circuit_broken?) - - result = circuit_breaker.perform { 'hello' } - - expect(result).to eq('hello') - end - end - end - - describe '#circuit_broken?' do - it 'is working when there is no last failure' do - set_in_redis(:last_failure, nil) - set_in_redis(:failure_count, 0) - - expect(circuit_breaker.circuit_broken?).to be_falsey - end - - it 'is broken when there was a recent failure' do - Timecop.freeze do - set_in_redis(:last_failure, 1.second.ago.to_f) - set_in_redis(:failure_count, 1) - - expect(circuit_breaker.circuit_broken?).to be_truthy - end - end - - it 'is broken when there are too many failures' do - set_in_redis(:last_failure, 1.day.ago.to_f) - set_in_redis(:failure_count, 200) - - expect(circuit_breaker.circuit_broken?).to be_truthy - end - - context 'the `failure_wait_time` is set to 0' do - before do - stub_application_setting(circuitbreaker_failure_wait_time: 0) - end + it 'tracks that the storage was accessible' do + set_in_redis(:failure_count, 10) + set_in_redis(:last_failure, Time.now.to_f) - it 'is working even when there is a recent failure' do - Timecop.freeze do - set_in_redis(:last_failure, 0.seconds.ago.to_f) - set_in_redis(:failure_count, 1) + circuit_breaker.perform { '' } - expect(circuit_breaker.circuit_broken?).to be_falsey - end - end + expect(value_from_redis(:failure_count).to_i).to eq(0) + expect(value_from_redis(:last_failure)).to be_empty + expect(circuit_breaker.failure_count).to eq(0) + expect(circuit_breaker.last_failure).to be_nil end - end - describe "storage_available?" do - context 'the storage is available' do - it 'tracks that the storage was accessible an raises the error' do - expect(circuit_breaker).to receive(:track_storage_accessible) - - circuit_breaker.storage_available? - end + it 'only accessibility check once' do + expect(Gitlab::Git::Storage::ForkedStorageCheck) + .to receive(:storage_available?).once.and_call_original - it 'only performs the check once' do - expect(Gitlab::Git::Storage::ForkedStorageCheck) - .to receive(:storage_available?).once.and_call_original - - 2.times { circuit_breaker.storage_available? } - end + 2.times { circuit_breaker.perform { '' } } end - context 'storage is not available' do - let(:storage_name) { 'broken' } - - it 'tracks that the storage was inaccessible' do - expect(circuit_breaker).to receive(:track_storage_inaccessible) + context 'with the feature disabled' do + it 'returns the block without checking accessibility' do + stub_feature_flags(git_storage_circuit_breaker: false) - circuit_breaker.storage_available? - end - end - end + expect(circuit_breaker).not_to receive(:circuit_broken?) - describe '#check_storage_accessible!' do - it 'raises an exception with retry time when the circuit is open' do - allow(circuit_breaker).to receive(:circuit_broken?).and_return(true) + result = circuit_breaker.perform { 'hello' } - expect { circuit_breaker.check_storage_accessible! } - .to raise_error do |exception| - expect(exception).to be_kind_of(Gitlab::Git::Storage::CircuitOpen) - expect(exception.retry_after).to eq(30) + expect(result).to eq('hello') end end context 'the storage is not available' do let(:storage_name) { 'broken' } - it 'raises an error' do + it 'raises the correct exception' do expect(circuit_breaker).to receive(:track_storage_inaccessible) - expect { circuit_breaker.check_storage_accessible! } + expect { circuit_breaker.perform { '' } } .to raise_error do |exception| expect(exception).to be_kind_of(Gitlab::Git::Storage::Inaccessible) expect(exception.retry_after).to eq(30) end end - end - end - - describe '#track_storage_inaccessible' do - around do |example| - Timecop.freeze { example.run } - end - - it 'records the failure time in redis' do - circuit_breaker.track_storage_inaccessible - - failure_time = value_from_redis(:last_failure) - expect(Time.at(failure_time.to_i)).to be_within(1.second).of(Time.now) - end - - it 'sets the failure time on the breaker without reloading' do - circuit_breaker.track_storage_inaccessible - - expect(circuit_breaker).not_to receive(:get_failure_info) - expect(circuit_breaker.last_failure).to eq(Time.now) - end - - it 'increments the failure count in redis' do - set_in_redis(:failure_count, 10) - - circuit_breaker.track_storage_inaccessible - - expect(value_from_redis(:failure_count).to_i).to be(11) - end - - it 'increments the failure count on the breaker without reloading' do - set_in_redis(:failure_count, 10) - - circuit_breaker.track_storage_inaccessible + it 'tracks that the storage was inaccessible' do + Timecop.freeze do + expect { circuit_breaker.perform { '' } }.to raise_error(Gitlab::Git::Storage::Inaccessible) - expect(circuit_breaker).not_to receive(:get_failure_info) - expect(circuit_breaker.failure_count).to eq(11) + expect(value_from_redis(:failure_count).to_i).to eq(1) + expect(value_from_redis(:last_failure)).not_to be_empty + expect(circuit_breaker.failure_count).to eq(1) + expect(circuit_breaker.last_failure).to be_within(1.second).of(Time.now) + end + end end end - describe '#track_storage_accessible' do - it 'sets the failure count to zero in redis' do - set_in_redis(:failure_count, 10) - - circuit_breaker.track_storage_accessible - - expect(value_from_redis(:failure_count).to_i).to be(0) - end - - it 'sets the failure count to zero on the breaker without reloading' do - set_in_redis(:failure_count, 10) - - circuit_breaker.track_storage_accessible + describe '#circuit_broken?' do + it 'is working when there is no last failure' do + set_in_redis(:last_failure, nil) + set_in_redis(:failure_count, 0) - expect(circuit_breaker).not_to receive(:get_failure_info) - expect(circuit_breaker.failure_count).to eq(0) + expect(circuit_breaker.circuit_broken?).to be_falsey end - it 'removes the last failure time from redis' do - set_in_redis(:last_failure, Time.now.to_i) - - circuit_breaker.track_storage_accessible + it 'is broken when there are too many failures' do + set_in_redis(:last_failure, 1.day.ago.to_f) + set_in_redis(:failure_count, 200) - expect(circuit_breaker).not_to receive(:get_failure_info) - expect(circuit_breaker.last_failure).to be_nil + expect(circuit_breaker.circuit_broken?).to be_truthy end + end - it 'removes the last failure time from the breaker without reloading' do - set_in_redis(:last_failure, Time.now.to_i) - - circuit_breaker.track_storage_accessible + describe '#backing_off?' do + it 'is true when there was a recent failure' do + Timecop.freeze do + set_in_redis(:last_failure, 1.second.ago.to_f) + set_in_redis(:failure_count, 90) - expect(value_from_redis(:last_failure)).to be_empty + expect(circuit_breaker.backing_off?).to be_truthy + end end - it 'wont connect to redis when there are no failures' do - expect(Gitlab::Git::Storage.redis).to receive(:with).once - .and_call_original - expect(circuit_breaker).to receive(:track_storage_accessible) - .and_call_original - - circuit_breaker.track_storage_accessible - end - end + context 'the `failure_wait_time` is set to 0' do + before do + stub_application_setting(circuitbreaker_failure_wait_time: 0) + end - describe '#no_failures?' do - it 'is false when a failure was tracked' do - set_in_redis(:last_failure, Time.now.to_i) - set_in_redis(:failure_count, 1) + it 'is working even when there are failures' do + Timecop.freeze do + set_in_redis(:last_failure, 0.seconds.ago.to_f) + set_in_redis(:failure_count, 90) - expect(circuit_breaker.no_failures?).to be_falsey + expect(circuit_breaker.backing_off?).to be_falsey + end + end end end @@ -351,10 +284,4 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: expect(circuit_breaker.failure_count).to eq(7) end end - - describe '#cache_key' do - it 'includes storage and host' do - expect(circuit_breaker.cache_key).to eq(cache_key) - end - end end diff --git a/spec/lib/gitlab/git/storage/null_circuit_breaker_spec.rb b/spec/lib/gitlab/git/storage/null_circuit_breaker_spec.rb index 7ee6d2f3709..5db37f55e03 100644 --- a/spec/lib/gitlab/git/storage/null_circuit_breaker_spec.rb +++ b/spec/lib/gitlab/git/storage/null_circuit_breaker_spec.rb @@ -65,17 +65,6 @@ describe Gitlab::Git::Storage::NullCircuitBreaker do ours = described_class.public_instance_methods theirs = Gitlab::Git::Storage::CircuitBreaker.public_instance_methods - # These methods are not part of the public API, but are public to allow the - # CircuitBreaker specs to operate. They should be made private over time. - exceptions = %i[ - cache_key - check_storage_accessible! - no_failures? - storage_available? - track_storage_accessible - track_storage_inaccessible - ] - - expect(theirs - ours).to contain_exactly(*exceptions) + expect(theirs - ours).to be_empty end end |