diff options
| -rw-r--r-- | app/models/lfs_object.rb | 4 | ||||
| -rw-r--r-- | changelogs/unreleased/poc-upload-hashing-path.yml (renamed from ee/changelogs/unreleased/poc-upload-hashing-path.yml) | 0 | ||||
| -rw-r--r-- | doc/administration/raketasks/check.md | 27 | ||||
| -rw-r--r-- | lib/gitlab/verify/batch_verifier.rb | 64 | ||||
| -rw-r--r-- | lib/gitlab/verify/lfs_objects.rb | 27 | ||||
| -rw-r--r-- | lib/gitlab/verify/rake_task.rb | 53 | ||||
| -rw-r--r-- | lib/gitlab/verify/uploads.rb | 27 | ||||
| -rw-r--r-- | lib/tasks/gitlab/lfs/check.rake | 8 | ||||
| -rw-r--r-- | lib/tasks/gitlab/lfs/migrate.rake (renamed from lib/tasks/gitlab/lfs.rake) | 0 | ||||
| -rw-r--r-- | spec/factories/lfs_objects.rb | 10 | ||||
| -rw-r--r-- | spec/lib/gitlab/verify/lfs_objects_spec.rb | 51 | ||||
| -rw-r--r-- | spec/lib/gitlab/verify/uploads_spec.rb | 60 | ||||
| -rw-r--r-- | spec/support/gitlab_verify.rb | 45 | ||||
| -rw-r--r-- | spec/tasks/gitlab/lfs/check_rake_spec.rb | 28 | ||||
| -rw-r--r-- | spec/tasks/gitlab/lfs/migrate_rake_spec.rb (renamed from spec/tasks/gitlab/lfs_rake_spec.rb) | 2 | ||||
| -rw-r--r-- | spec/tasks/gitlab/uploads/check_rake_spec.rb | 28 | 
16 files changed, 423 insertions, 11 deletions
| diff --git a/app/models/lfs_object.rb b/app/models/lfs_object.rb index 6c78c086662..64e88d5a6a2 100644 --- a/app/models/lfs_object.rb +++ b/app/models/lfs_object.rb @@ -24,4 +24,8 @@ class LfsObject < ActiveRecord::Base          .where(lfs_objects_projects: { id: nil })          .destroy_all    end + +  def self.calculate_oid(path) +    Digest::SHA256.file(path).hexdigest +  end  end diff --git a/ee/changelogs/unreleased/poc-upload-hashing-path.yml b/changelogs/unreleased/poc-upload-hashing-path.yml index 7970405bea1..7970405bea1 100644 --- a/ee/changelogs/unreleased/poc-upload-hashing-path.yml +++ b/changelogs/unreleased/poc-upload-hashing-path.yml diff --git a/doc/administration/raketasks/check.md b/doc/administration/raketasks/check.md index d1ed152b58c..d73d9422d2c 100644 --- a/doc/administration/raketasks/check.md +++ b/doc/administration/raketasks/check.md @@ -78,34 +78,41 @@ Example output:  ## Uploaded Files Integrity -The uploads check Rake task will loop through all uploads in the database -and run two checks to determine the integrity of each file: +Various types of file can be uploaded to a GitLab installation by users. +Checksums are generated and stored in the database upon upload, and integrity +checks using those checksums can be run. These checks also detect missing files. -1. Check if the file exist on the file system. -1. Check if the checksum of the file on the file system matches the checksum in the database. +Currently, integrity checks are supported for the following types of file: + +* LFS objects +* User uploads  **Omnibus Installation**  ``` +sudo gitlab-rake gitlab:lfs:check  sudo gitlab-rake gitlab:uploads:check  ```  **Source Installation**  ```bash +sudo -u git -H bundle exec rake gitlab:lfs:check RAILS_ENV=production  sudo -u git -H bundle exec rake gitlab:uploads:check RAILS_ENV=production  ``` -This task also accepts some environment variables which you can use to override +These tasks also accept some environment variables which you can use to override  certain values: -Variable | Type | Description --------- | ---- | ----------- -`BATCH`   | integer  | Specifies the size of the batch. Defaults to 200. -`ID_FROM` | integer  | Specifies the ID to start from, inclusive of the value. -`ID_TO`   | integer  | Specifies the ID value to end at, inclusive of the value. +Variable  | Type    | Description +--------- | ------- | ----------- +`BATCH`   | integer | Specifies the size of the batch. Defaults to 200. +`ID_FROM` | integer | Specifies the ID to start from, inclusive of the value. +`ID_TO`   | integer | Specifies the ID value to end at, inclusive of the value. +`VERBOSE` | boolean | Causes failures to be listed individually, rather than being summarized.  ```bash +sudo gitlab-rake gitlab:lfs:check BATCH=100 ID_FROM=50 ID_TO=250  sudo gitlab-rake gitlab:uploads:check BATCH=100 ID_FROM=50 ID_TO=250  ``` diff --git a/lib/gitlab/verify/batch_verifier.rb b/lib/gitlab/verify/batch_verifier.rb new file mode 100644 index 00000000000..1ef369a4b67 --- /dev/null +++ b/lib/gitlab/verify/batch_verifier.rb @@ -0,0 +1,64 @@ +module Gitlab +  module Verify +    class BatchVerifier +      attr_reader :batch_size, :start, :finish + +      def initialize(batch_size:, start: nil, finish: nil) +        @batch_size = batch_size +        @start = start +        @finish = finish +      end + +      # Yields a Range of IDs and a Hash of failed verifications (object => error) +      def run_batches(&blk) +        relation.in_batches(of: batch_size, start: start, finish: finish) do |relation| # rubocop: disable Cop/InBatches +          range = relation.first.id..relation.last.id +          failures = run_batch(relation) + +          yield(range, failures) +        end +      end + +      def name +        raise NotImplementedError.new +      end + +      def describe(_object) +        raise NotImplementedError.new +      end + +      private + +      def run_batch(relation) +        relation.map { |upload| verify(upload) }.compact.to_h +      end + +      def verify(object) +        expected = expected_checksum(object) +        actual = actual_checksum(object) + +        raise 'Checksum missing' unless expected.present? +        raise 'Checksum mismatch' unless expected == actual + +        nil +      rescue => err +        [object, err] +      end + +      # This should return an ActiveRecord::Relation suitable for calling #in_batches on +      def relation +        raise NotImplementedError.new +      end + +      # The checksum we expect the object to have +      def expected_checksum(_object) +        raise NotImplementedError.new +      end + +      # The freshly-recalculated checksum of the object +      def actual_checksum(_object) +        raise NotImplementedError.new +      end +    end +  end +end diff --git a/lib/gitlab/verify/lfs_objects.rb b/lib/gitlab/verify/lfs_objects.rb new file mode 100644 index 00000000000..970e2a7b718 --- /dev/null +++ b/lib/gitlab/verify/lfs_objects.rb @@ -0,0 +1,27 @@ +module Gitlab +  module Verify +    class LfsObjects < BatchVerifier +      def name +        'LFS objects' +      end + +      def describe(object) +        "LFS object: #{object.oid}" +      end + +      private + +      def relation +        LfsObject.with_files_stored_locally +      end + +      def expected_checksum(lfs_object) +        lfs_object.oid +      end + +      def actual_checksum(lfs_object) +        LfsObject.calculate_oid(lfs_object.file.path) +      end +    end +  end +end diff --git a/lib/gitlab/verify/rake_task.rb b/lib/gitlab/verify/rake_task.rb new file mode 100644 index 00000000000..dd138e6b92b --- /dev/null +++ b/lib/gitlab/verify/rake_task.rb @@ -0,0 +1,53 @@ +module Gitlab +  module Verify +    class RakeTask +      def self.run!(verify_kls) +        verifier = verify_kls.new( +          batch_size: ENV.fetch('BATCH', 200).to_i, +          start: ENV['ID_FROM'], +          finish: ENV['ID_TO'] +        ) + +        verbose = Gitlab::Utils.to_boolean(ENV['VERBOSE']) + +        new(verifier, verbose).run! +      end + +      attr_reader :verifier, :output + +      def initialize(verifier, verbose) +        @verifier = verifier +        @verbose = verbose +      end + +      def run! +        say "Checking integrity of #{verifier.name}" + +        verifier.run_batches { |*args| run_batch(*args) } + +        say 'Done!' +      end + +      def verbose? +        !!@verbose +      end + +      private + +      def say(text) +        puts(text) # rubocop:disable Rails/Output +      end + +      def run_batch(range, failures) +        status_color = failures.empty? ? :green : :red +        say "- #{range}: Failures: #{failures.count}".color(status_color) + +        return unless verbose? + +        failures.each do |object, error| +          say "  - #{verifier.describe(object)}: #{error.inspect}".color(:red) +        end +      end +    end +  end +end diff --git a/lib/gitlab/verify/uploads.rb b/lib/gitlab/verify/uploads.rb new file mode 100644 index 00000000000..0ffa71a6d72 --- /dev/null +++ b/lib/gitlab/verify/uploads.rb @@ -0,0 +1,27 @@ +module Gitlab +  module Verify +    class Uploads < BatchVerifier +      def name +        'Uploads' +      end + +      def describe(object) +        "Upload: #{object.id}" +      end + +      private + +      def relation +        Upload.with_files_stored_locally +      end + +      def expected_checksum(upload) +        upload.checksum +      end + +      def actual_checksum(upload) +        Upload.hexdigest(upload.absolute_path) +      end +    end +  end +end diff --git a/lib/tasks/gitlab/lfs/check.rake b/lib/tasks/gitlab/lfs/check.rake new file mode 100644 index 00000000000..869463d4e5d --- /dev/null +++ b/lib/tasks/gitlab/lfs/check.rake @@ -0,0 +1,8 @@ +namespace :gitlab do +  namespace :lfs do +    desc 'GitLab | LFS | Check integrity of uploaded LFS objects' +    task check: :environment do +      Gitlab::Verify::RakeTask.run!(Gitlab::Verify::LfsObjects) +    end +  end +end diff --git a/lib/tasks/gitlab/lfs.rake b/lib/tasks/gitlab/lfs/migrate.rake index a45e5ca91e0..a45e5ca91e0 100644 --- a/lib/tasks/gitlab/lfs.rake +++ b/lib/tasks/gitlab/lfs/migrate.rake diff --git a/spec/factories/lfs_objects.rb b/spec/factories/lfs_objects.rb index 8eb709022ce..eaf3a4ed497 100644 --- a/spec/factories/lfs_objects.rb +++ b/spec/factories/lfs_objects.rb @@ -9,4 +9,14 @@ FactoryBot.define do    trait :with_file do      file { fixture_file_upload(Rails.root + "spec/fixtures/dk.png", "`/png") }    end + +  # The uniqueness constraint means we can't use the correct OID for all LFS +  # objects, so the test needs to decide which (if any) object gets it +  trait :correct_oid do +    oid 'b804383982bb89b00e828e3f44c038cc991d3d1768009fc39ba8e2c081b9fb75' +  end + +  trait :object_storage do +    file_store { LfsObjectUploader::Store::REMOTE } +  end  end diff --git a/spec/lib/gitlab/verify/lfs_objects_spec.rb b/spec/lib/gitlab/verify/lfs_objects_spec.rb new file mode 100644 index 00000000000..0f890e2c7ce --- /dev/null +++ b/spec/lib/gitlab/verify/lfs_objects_spec.rb @@ -0,0 +1,51 @@ +require 'spec_helper' + +describe Gitlab::Verify::LfsObjects do +  include GitlabVerifyHelpers + +  it_behaves_like 'Gitlab::Verify::BatchVerifier subclass' do +    let!(:objects) { create_list(:lfs_object, 3, :with_file) } +  end + +  describe '#run_batches' do +    let(:failures) { collect_failures } +    let(:failure) { failures[lfs_object] } + +    let!(:lfs_object) { create(:lfs_object, :with_file, :correct_oid) } + +    it 'passes LFS objects with the correct file' do +      expect(failures).to eq({}) +    end + +    it 'fails LFS objects with a missing file' do +      FileUtils.rm_f(lfs_object.file.path) + +      expect(failures.keys).to contain_exactly(lfs_object) +      expect(failure).to be_a(Errno::ENOENT) +      expect(failure.to_s).to include(lfs_object.file.path) +    end + +    it 'fails LFS objects with a mismatched oid' do +      File.truncate(lfs_object.file.path, 0) + +      expect(failures.keys).to contain_exactly(lfs_object) +      expect(failure.to_s).to include('Checksum mismatch') +    end + +    context 'with remote files' do +      before do +        stub_lfs_object_storage +      end + +      it 'skips LFS objects in object storage' do +        local_failure = create(:lfs_object) +        create(:lfs_object, :object_storage) + +        failures = {} +        described_class.new(batch_size: 10).run_batches { |_, failed| failures.merge!(failed) } + +        expect(failures.keys).to contain_exactly(local_failure) +      end +    end +  end +end diff --git a/spec/lib/gitlab/verify/uploads_spec.rb b/spec/lib/gitlab/verify/uploads_spec.rb new file mode 100644 index 00000000000..85768308edc --- /dev/null +++ b/spec/lib/gitlab/verify/uploads_spec.rb @@ -0,0 +1,60 @@ +require 'spec_helper' + +describe Gitlab::Verify::Uploads do +  include GitlabVerifyHelpers + +  it_behaves_like 'Gitlab::Verify::BatchVerifier subclass' do +    let(:projects) { create_list(:project, 3, :with_avatar) } +    let!(:objects) { projects.flat_map(&:uploads) } +  end + +  describe '#run_batches' do +    let(:project) { create(:project, :with_avatar) } +    let(:failures) { collect_failures } +    let(:failure) { failures[upload] } + +    let!(:upload) { project.uploads.first } + +    it 'passes uploads with the correct file' do +      expect(failures).to eq({}) +    end + +    it 'fails uploads with a missing file' do +      FileUtils.rm_f(upload.absolute_path) + +      expect(failures.keys).to contain_exactly(upload) +      expect(failure).to be_a(Errno::ENOENT) +      expect(failure.to_s).to include(upload.absolute_path) +    end + +    it 'fails uploads with a mismatched checksum' do +      upload.update!(checksum: 'something incorrect') + +      expect(failures.keys).to contain_exactly(upload) +      expect(failure.to_s).to include('Checksum mismatch') +    end + +    it 'fails uploads with a missing precalculated checksum' do +      upload.update!(checksum: '') + +      expect(failures.keys).to contain_exactly(upload) +      expect(failure.to_s).to include('Checksum missing') +    end + +    context 'with remote files' do +      before do +        stub_uploads_object_storage(AvatarUploader) +      end + +      it 'skips uploads in object storage' do +        local_failure = create(:upload) +        create(:upload, :object_storage) + +        failures = {} +        described_class.new(batch_size: 10).run_batches { |_, failed| failures.merge!(failed) } + +        expect(failures.keys).to contain_exactly(local_failure) +      end +    end +  end +end diff --git a/spec/support/gitlab_verify.rb b/spec/support/gitlab_verify.rb new file mode 100644 index 00000000000..13e2e37624d --- /dev/null +++ b/spec/support/gitlab_verify.rb @@ -0,0 +1,45 @@ +RSpec.shared_examples 'Gitlab::Verify::BatchVerifier subclass' do +  describe 'batching' do +    let(:first_batch) { objects[0].id..objects[0].id } +    let(:second_batch) { objects[1].id..objects[1].id } +    let(:third_batch) { objects[2].id..objects[2].id } + +    it 'iterates through objects in batches' do +      expect(collect_ranges).to eq([first_batch, second_batch, third_batch]) +    end + +    it 'allows the starting ID to be specified' do +      expect(collect_ranges(start: second_batch.first)).to eq([second_batch, third_batch]) +    end + +    it 'allows the finishing ID to be specified' do +      expect(collect_ranges(finish: second_batch.last)).to eq([first_batch, second_batch]) +    end +  end +end + +module GitlabVerifyHelpers +  def collect_ranges(args = {}) +    verifier = described_class.new(args.merge(batch_size: 1)) + +    collect_results(verifier).map { |range, _| range } +  end + +  def collect_failures +    verifier = described_class.new(batch_size: 1) + +    out = {} + +    collect_results(verifier).map { |_, failures| out.merge!(failures) } + +    out +  end + +  def collect_results(verifier) +    out = [] + +    verifier.run_batches { |*args| out << args } + +    out +  end +end diff --git a/spec/tasks/gitlab/lfs/check_rake_spec.rb b/spec/tasks/gitlab/lfs/check_rake_spec.rb new file mode 100644 index 00000000000..2610edf8bac --- /dev/null +++ b/spec/tasks/gitlab/lfs/check_rake_spec.rb @@ -0,0 +1,28 @@ +require 'rake_helper' + +describe 'gitlab:lfs rake tasks' do +  describe 'check' do +    let!(:lfs_object) { create(:lfs_object, :with_file, :correct_oid) } + +    before do +      Rake.application.rake_require('tasks/gitlab/lfs/check') +      stub_env('VERBOSE' => 'true') +    end + +    it 'outputs the integrity check for each batch' do +      expect { run_rake_task('gitlab:lfs:check') }.to output(/Failures: 0/).to_stdout +    end + +    it 'errors out about missing files on the file system' do +      FileUtils.rm_f(lfs_object.file.path) + +      expect { run_rake_task('gitlab:lfs:check') }.to output(/No such file.*#{Regexp.quote(lfs_object.file.path)}/).to_stdout +    end + +    it 'errors out about invalid checksum' do +      File.truncate(lfs_object.file.path, 0) + +      expect { run_rake_task('gitlab:lfs:check') }.to output(/Checksum mismatch/).to_stdout +    end +  end +end diff --git a/spec/tasks/gitlab/lfs_rake_spec.rb b/spec/tasks/gitlab/lfs/migrate_rake_spec.rb index f1b677bd6ee..66d1a192a96 100644 --- a/spec/tasks/gitlab/lfs_rake_spec.rb +++ b/spec/tasks/gitlab/lfs/migrate_rake_spec.rb @@ -2,7 +2,7 @@ require 'rake_helper'  describe 'gitlab:lfs namespace rake task' do    before :all do -    Rake.application.rake_require 'tasks/gitlab/lfs' +    Rake.application.rake_require 'tasks/gitlab/lfs/migrate'    end    describe 'migrate' do diff --git a/spec/tasks/gitlab/uploads/check_rake_spec.rb b/spec/tasks/gitlab/uploads/check_rake_spec.rb new file mode 100644 index 00000000000..5d597c66133 --- /dev/null +++ b/spec/tasks/gitlab/uploads/check_rake_spec.rb @@ -0,0 +1,28 @@ +require 'rake_helper' + +describe 'gitlab:uploads rake tasks' do +  describe 'check' do +    let!(:upload) { create(:upload, path: Rails.root.join('spec/fixtures/banana_sample.gif')) } + +    before do +      Rake.application.rake_require('tasks/gitlab/uploads/check') +      stub_env('VERBOSE' => 'true') +    end + +    it 'outputs the integrity check for each batch' do +      expect { run_rake_task('gitlab:uploads:check') }.to output(/Failures: 0/).to_stdout +    end + +    it 'errors out about missing files on the file system' do +      missing_upload = create(:upload) + +      expect { run_rake_task('gitlab:uploads:check') }.to output(/No such file.*#{Regexp.quote(missing_upload.absolute_path)}/).to_stdout +    end + +    it 'errors out about invalid checksum' do +      upload.update_column(:checksum, '01a3156db2cf4f67ec823680b40b7302f89ab39179124ad219f94919b8a1769e') + +      expect { run_rake_task('gitlab:uploads:check') }.to output(/Checksum mismatch/).to_stdout +    end +  end +end | 
