diff options
author | Stan Hu <stanhu@gmail.com> | 2018-12-19 01:14:06 +0000 |
---|---|---|
committer | Stan Hu <stanhu@gmail.com> | 2018-12-19 01:14:06 +0000 |
commit | 5a866cbd23ff49c60781cf7f8db546049e7964fb (patch) | |
tree | fe25a918d4729f8f23d12dccbb17f93b42e8c440 | |
parent | 518a751c1a5eb78065bcbd7b9623bd5ab9822141 (diff) | |
parent | 4a132952db680e84f66014ebb4e68e713fa31d35 (diff) | |
download | gitlab-ce-5a866cbd23ff49c60781cf7f8db546049e7964fb.tar.gz |
Merge branch '54781-backfill-project-repositories-for-projects-on-legacy-storage' into 'master'
Backfill project_repositories for legacy storage projects
See merge request gitlab-org/gitlab-ce!23920
10 files changed, 423 insertions, 208 deletions
diff --git a/db/post_migrate/20181218192239_backfill_project_repositories_for_legacy_storage_projects.rb b/db/post_migrate/20181218192239_backfill_project_repositories_for_legacy_storage_projects.rb new file mode 100644 index 00000000000..42f96750789 --- /dev/null +++ b/db/post_migrate/20181218192239_backfill_project_repositories_for_legacy_storage_projects.rb @@ -0,0 +1,26 @@ +# frozen_string_literal: true + +class BackfillProjectRepositoriesForLegacyStorageProjects < ActiveRecord::Migration[5.0] + include Gitlab::Database::MigrationHelpers + + DOWNTIME = false + BATCH_SIZE = 1_000 + DELAY_INTERVAL = 5.minutes + MIGRATION = 'BackfillLegacyProjectRepositories' + + disable_ddl_transaction! + + class Project < ActiveRecord::Base + include EachBatch + + self.table_name = 'projects' + end + + def up + queue_background_migration_jobs_by_range_at_intervals(Project, MIGRATION, DELAY_INTERVAL) + end + + def down + # no-op: since there could have been existing rows before the migration do not remove anything + end +end diff --git a/db/schema.rb b/db/schema.rb index 008bff49a2b..604ed1cd6b0 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system. 
-ActiveRecord::Schema.define(version: 20181212104941) do +ActiveRecord::Schema.define(version: 20181218192239) do # These are extensions that must be enabled in order to support this database enable_extension "plpgsql" diff --git a/lib/gitlab/background_migration/backfill_hashed_project_repositories.rb b/lib/gitlab/background_migration/backfill_hashed_project_repositories.rb index 2f76f2f7434..a6194616663 100644 --- a/lib/gitlab/background_migration/backfill_hashed_project_repositories.rb +++ b/lib/gitlab/background_migration/backfill_hashed_project_repositories.rb @@ -2,132 +2,13 @@ module Gitlab module BackgroundMigration - # Class that will create fill the project_repositories table - # for all projects that are on hashed storage and an entry is - # is missing in this table. - class BackfillHashedProjectRepositories - # Shard model - class Shard < ActiveRecord::Base - self.table_name = 'shards' - end - - # Class that will find or create the shard by name. - # There is only a small set of shards, which would - # not change quickly, so look them up from memory - # instead of hitting the DB each time. - class ShardFinder - def find_shard_id(name) - shard_id = shards.fetch(name, nil) - return shard_id if shard_id.present? - - Shard.transaction(requires_new: true) do - create!(name) - end - rescue ActiveRecord::RecordNotUnique - reload! - retry - end - - private - - def create!(name) - Shard.create!(name: name).tap { |shard| @shards[name] = shard.id } - end - - def shards - @shards ||= reload! - end - - def reload! 
- @shards = Hash[*Shard.all.map { |shard| [shard.name, shard.id] }.flatten] - end - end - - # ProjectRegistry model - class ProjectRepository < ActiveRecord::Base - self.table_name = 'project_repositories' - - belongs_to :project, inverse_of: :project_repository - end - - # Project model - class Project < ActiveRecord::Base - self.table_name = 'projects' - - HASHED_PATH_PREFIX = '@hashed' - - HASHED_STORAGE_FEATURES = { - repository: 1, - attachments: 2 - }.freeze - - has_one :project_repository, inverse_of: :project - - class << self - def on_hashed_storage - where(Project.arel_table[:storage_version] - .gteq(HASHED_STORAGE_FEATURES[:repository])) - end - - def without_project_repository - joins(left_outer_join_project_repository) - .where(ProjectRepository.arel_table[:project_id].eq(nil)) - end - - def left_outer_join_project_repository - projects_table = Project.arel_table - repository_table = ProjectRepository.arel_table - - projects_table - .join(repository_table, Arel::Nodes::OuterJoin) - .on(projects_table[:id].eq(repository_table[:project_id])) - .join_sources - end - end - - def hashed_storage? - self.storage_version && self.storage_version >= 1 - end - - def hashed_disk_path - "#{HASHED_PATH_PREFIX}/#{disk_hash[0..1]}/#{disk_hash[2..3]}/#{disk_hash}" - end - - def disk_hash - @disk_hash ||= Digest::SHA2.hexdigest(id.to_s) - end - end - - def perform(start_id, stop_id) - Gitlab::Database.bulk_insert(:project_repositories, project_repositories(start_id, stop_id)) - end - + # Class that will fill the project_repositories table for projects that + # are on hashed storage and for which an entry is missing in this table. 
+ class BackfillHashedProjectRepositories < BackfillProjectRepositories private - def project_repositories(start_id, stop_id) + def projects Project.on_hashed_storage - .without_project_repository - .where(id: start_id..stop_id) - .map { |project| build_attributes_for_project(project) } - .compact - end - - def build_attributes_for_project(project) - return unless project.hashed_storage? - - { - project_id: project.id, - shard_id: find_shard_id(project.repository_storage), - disk_path: project.hashed_disk_path - } - end - - def find_shard_id(repository_storage) - shard_finder.find_shard_id(repository_storage) - end - - def shard_finder - @shard_finder ||= ShardFinder.new end end end diff --git a/lib/gitlab/background_migration/backfill_legacy_project_repositories.rb b/lib/gitlab/background_migration/backfill_legacy_project_repositories.rb new file mode 100644 index 00000000000..6dc92672929 --- /dev/null +++ b/lib/gitlab/background_migration/backfill_legacy_project_repositories.rb @@ -0,0 +1,15 @@ +# frozen_string_literal: true + +module Gitlab + module BackgroundMigration + # Class that will fill the project_repositories table for projects that + # are on legacy storage and for which an entry is missing in this table. + class BackfillLegacyProjectRepositories < BackfillProjectRepositories + private + + def projects + Project.with_parent.on_legacy_storage + end + end + end +end diff --git a/lib/gitlab/background_migration/backfill_project_repositories.rb b/lib/gitlab/background_migration/backfill_project_repositories.rb new file mode 100644 index 00000000000..aaf520d70f6 --- /dev/null +++ b/lib/gitlab/background_migration/backfill_project_repositories.rb @@ -0,0 +1,219 @@ +# frozen_string_literal: true + +module Gitlab + module BackgroundMigration + # Class that will fill the project_repositories table + # for projects for which an entry is missing in this table. 
+ class BackfillProjectRepositories + OrphanedNamespaceError = Class.new(StandardError) + + # Shard model + class Shard < ActiveRecord::Base + self.table_name = 'shards' + end + + # Class that will find or create the shard by name. + # There is only a small set of shards, which would + # not change quickly, so look them up from memory + # instead of hitting the DB each time. + class ShardFinder + def find_shard_id(name) + shard_id = shards.fetch(name, nil) + return shard_id if shard_id.present? + + Shard.transaction(requires_new: true) do + create!(name) + end + rescue ActiveRecord::RecordNotUnique + reload! + retry + end + + private + + def create!(name) + Shard.create!(name: name).tap { |shard| @shards[name] = shard.id } + end + + def shards + @shards ||= reload! + end + + def reload! + @shards = Hash[*Shard.all.map { |shard| [shard.name, shard.id] }.flatten] + end + end + + module Storage + # Class that returns the disk path for a project using hashed storage + class HashedProject + attr_accessor :project + + ROOT_PATH_PREFIX = '@hashed' + + def initialize(project) + @project = project + end + + def disk_path + "#{ROOT_PATH_PREFIX}/#{disk_hash[0..1]}/#{disk_hash[2..3]}/#{disk_hash}" + end + + def disk_hash + @disk_hash ||= Digest::SHA2.hexdigest(project.id.to_s) + end + end + + # Class that returns the disk path for a project using legacy storage + class LegacyProject + attr_accessor :project + + def initialize(project) + @project = project + end + + def disk_path + project.full_path + end + end + end + + # Concern used by Project and Namespace to determine the full route to the project + module Routable + extend ActiveSupport::Concern + + def full_path + @full_path ||= build_full_path + end + + def build_full_path + return path unless has_parent? + + raise OrphanedNamespaceError if parent.nil? + + parent.full_path + '/' + path + end + + def has_parent? + read_attribute(association(:parent).reflection.foreign_key) + end + end + + # Namespace model. 
+ class Namespace < ActiveRecord::Base + self.table_name = 'namespaces' + self.inheritance_column = nil + + include Routable + + belongs_to :parent, class_name: 'Namespace', inverse_of: 'namespaces' + + has_many :projects, inverse_of: :parent + has_many :namespaces, inverse_of: :parent + end + + # ProjectRepository model + class ProjectRepository < ActiveRecord::Base + self.table_name = 'project_repositories' + + belongs_to :project, inverse_of: :project_repository + end + + # Project model + class Project < ActiveRecord::Base + self.table_name = 'projects' + + include Routable + + HASHED_STORAGE_FEATURES = { + repository: 1, + attachments: 2 + }.freeze + + scope :with_parent, -> { includes(:parent) } + + belongs_to :parent, class_name: 'Namespace', foreign_key: :namespace_id, inverse_of: 'projects' + + has_one :project_repository, inverse_of: :project + + delegate :disk_path, to: :storage + + class << self + def on_hashed_storage + where(Project.arel_table[:storage_version] + .gteq(HASHED_STORAGE_FEATURES[:repository])) + end + + def on_legacy_storage + where(Project.arel_table[:storage_version].eq(nil) + .or(Project.arel_table[:storage_version].eq(0))) + end + + def without_project_repository + joins(left_outer_join_project_repository) + .where(ProjectRepository.arel_table[:project_id].eq(nil)) + end + + def left_outer_join_project_repository + projects_table = Project.arel_table + repository_table = ProjectRepository.arel_table + + projects_table + .join(repository_table, Arel::Nodes::OuterJoin) + .on(projects_table[:id].eq(repository_table[:project_id])) + .join_sources + end + end + + def storage + @storage ||= + if hashed_storage? + Storage::HashedProject.new(self) + else + Storage::LegacyProject.new(self) + end + end + + def hashed_storage? 
+ self.storage_version && + self.storage_version >= HASHED_STORAGE_FEATURES[:repository] + end + end + + def perform(start_id, stop_id) + Gitlab::Database.bulk_insert(:project_repositories, project_repositories(start_id, stop_id)) + end + + private + + def projects + raise NotImplementedError, + "#{self.class} does not implement #{__method__}" + end + + def project_repositories(start_id, stop_id) + projects + .without_project_repository + .where(id: start_id..stop_id) + .map { |project| build_attributes_for_project(project) } + .compact + end + + def build_attributes_for_project(project) + { + project_id: project.id, + shard_id: find_shard_id(project.repository_storage), + disk_path: project.disk_path + } + end + + def find_shard_id(repository_storage) + shard_finder.find_shard_id(repository_storage) + end + + def shard_finder + @shard_finder ||= ShardFinder.new + end + end + end +end diff --git a/spec/factories/project_repositories.rb b/spec/factories/project_repositories.rb new file mode 100644 index 00000000000..39e8ea2e11e --- /dev/null +++ b/spec/factories/project_repositories.rb @@ -0,0 +1,12 @@ +# frozen_string_literal: true + +FactoryBot.define do + factory :project_repository do + project + + after(:build) do |project_repository, _| + project_repository.shard_name = project_repository.project.repository_storage + project_repository.disk_path = project_repository.project.disk_path + end + end +end diff --git a/spec/lib/gitlab/background_migration/backfill_hashed_project_repositories_spec.rb b/spec/lib/gitlab/background_migration/backfill_hashed_project_repositories_spec.rb index b6c1edbbf8b..e802613490b 100644 --- a/spec/lib/gitlab/background_migration/backfill_hashed_project_repositories_spec.rb +++ b/spec/lib/gitlab/background_migration/backfill_hashed_project_repositories_spec.rb @@ -3,88 +3,5 @@ require 'spec_helper' describe Gitlab::BackgroundMigration::BackfillHashedProjectRepositories, :migration, schema: 20181130102132 do - let(:namespaces) { 
table(:namespaces) } - let(:project_repositories) { table(:project_repositories) } - let(:projects) { table(:projects) } - let(:shards) { table(:shards) } - let(:group) { namespaces.create!(name: 'foo', path: 'foo') } - let(:shard) { shards.create!(name: 'default') } - - describe described_class::ShardFinder do - describe '#find_shard_id' do - it 'creates a new shard when it does not exist yet' do - expect { subject.find_shard_id('other') }.to change(shards, :count).by(1) - end - - it 'returns the shard when it exists' do - shards.create(id: 5, name: 'other') - - shard_id = subject.find_shard_id('other') - - expect(shard_id).to eq(5) - end - - it 'only queries the database once to retrieve shards' do - subject.find_shard_id('default') - - expect { subject.find_shard_id('default') }.not_to exceed_query_limit(0) - end - end - end - - describe described_class::Project do - describe '.on_hashed_storage' do - it 'finds projects with repository on hashed storage' do - projects.create!(id: 1, name: 'foo', path: 'foo', namespace_id: group.id, storage_version: 1) - projects.create!(id: 2, name: 'bar', path: 'bar', namespace_id: group.id, storage_version: 2) - projects.create!(id: 3, name: 'baz', path: 'baz', namespace_id: group.id, storage_version: 0) - projects.create!(id: 4, name: 'zoo', path: 'zoo', namespace_id: group.id, storage_version: nil) - - expect(described_class.on_hashed_storage.pluck(:id)).to match_array([1, 2]) - end - end - - describe '.without_project_repository' do - it 'finds projects which do not have a projects_repositories entry' do - projects.create!(id: 1, name: 'foo', path: 'foo', namespace_id: group.id) - projects.create!(id: 2, name: 'bar', path: 'bar', namespace_id: group.id) - project_repositories.create!(project_id: 2, disk_path: '@phony/foo/bar', shard_id: shard.id) - - expect(described_class.without_project_repository.pluck(:id)).to contain_exactly(1) - end - end - end - - describe '#perform' do - it 'creates a project_repository row for 
projects on hashed storage that need one' do - projects.create!(id: 1, name: 'foo', path: 'foo', namespace_id: group.id, storage_version: 1) - projects.create!(id: 2, name: 'bar', path: 'bar', namespace_id: group.id, storage_version: 2) - - expect { described_class.new.perform(1, projects.last.id) }.to change(project_repositories, :count).by(2) - end - - it 'does nothing for projects on hashed storage that have already a project_repository row' do - projects.create!(id: 1, name: 'foo', path: 'foo', namespace_id: group.id, storage_version: 1) - project_repositories.create!(project_id: 1, disk_path: '@phony/foo/bar', shard_id: shard.id) - - expect { described_class.new.perform(1, projects.last.id) }.not_to change(project_repositories, :count) - end - - it 'does nothing for projects on legacy storage' do - projects.create!(name: 'foo', path: 'foo', namespace_id: group.id, storage_version: 0) - - expect { described_class.new.perform(1, projects.last.id) }.not_to change(project_repositories, :count) - end - - it 'inserts rows in a single query' do - projects.create!(name: 'foo', path: 'foo', namespace_id: group.id, storage_version: 1, repository_storage: shard.name) - - control_count = ActiveRecord::QueryRecorder.new { described_class.new.perform(1, projects.last.id) } - - projects.create!(name: 'bar', path: 'bar', namespace_id: group.id, storage_version: 1, repository_storage: shard.name) - projects.create!(name: 'zoo', path: 'zoo', namespace_id: group.id, storage_version: 1, repository_storage: shard.name) - - expect { described_class.new.perform(1, projects.last.id) }.not_to exceed_query_limit(control_count) - end - end + it_behaves_like 'backfill migration for project repositories', :hashed end diff --git a/spec/lib/gitlab/background_migration/backfill_legacy_project_repositories_spec.rb b/spec/lib/gitlab/background_migration/backfill_legacy_project_repositories_spec.rb new file mode 100644 index 00000000000..ae4b53d62e6 --- /dev/null +++ 
b/spec/lib/gitlab/background_migration/backfill_legacy_project_repositories_spec.rb @@ -0,0 +1,7 @@ +# frozen_string_literal: true + +require 'spec_helper' + +describe Gitlab::BackgroundMigration::BackfillLegacyProjectRepositories, :migration, schema: 20181218192239 do + it_behaves_like 'backfill migration for project repositories', :legacy +end diff --git a/spec/lib/gitlab/background_migration/backfill_project_repositories_spec.rb b/spec/lib/gitlab/background_migration/backfill_project_repositories_spec.rb new file mode 100644 index 00000000000..53c071f0268 --- /dev/null +++ b/spec/lib/gitlab/background_migration/backfill_project_repositories_spec.rb @@ -0,0 +1,94 @@ +# frozen_string_literal: true + +require 'spec_helper' + +describe Gitlab::BackgroundMigration::BackfillProjectRepositories do + let(:group) { create(:group, name: 'foo', path: 'foo') } + + describe described_class::ShardFinder do + let(:shard) { create(:shard, name: 'default') } + + describe '#find_shard_id' do + it 'creates a new shard when it does not exist yet' do + expect { subject.find_shard_id('other') }.to change(Shard, :count).by(1) + end + + it 'returns the shard when it exists' do + other_shard = create(:shard, name: 'other') + + shard_id = subject.find_shard_id('other') + + expect(shard_id).to eq(other_shard.id) + end + + it 'only queries the database once to retrieve shards' do + subject.find_shard_id('default') + + expect { subject.find_shard_id('default') }.not_to exceed_query_limit(0) + end + end + end + + describe described_class::Project do + let!(:project_hashed_storage_1) { create(:project, name: 'foo', path: 'foo', namespace: group, storage_version: 1) } + let!(:project_hashed_storage_2) { create(:project, name: 'bar', path: 'bar', namespace: group, storage_version: 2) } + let!(:project_legacy_storage_3) { create(:project, name: 'baz', path: 'baz', namespace: group, storage_version: 0) } + let!(:project_legacy_storage_4) { create(:project, name: 'zoo', path: 'zoo', namespace: 
group, storage_version: nil) } + + describe '.on_hashed_storage' do + it 'finds projects with repository on hashed storage' do + projects = described_class.on_hashed_storage.pluck(:id) + + expect(projects).to match_array([project_hashed_storage_1.id, project_hashed_storage_2.id]) + end + end + + describe '.on_legacy_storage' do + it 'finds projects with repository on legacy storage' do + projects = described_class.on_legacy_storage.pluck(:id) + + expect(projects).to match_array([project_legacy_storage_3.id, project_legacy_storage_4.id]) + end + end + + describe '.without_project_repository' do + it 'finds projects which do not have a projects_repositories entry' do + create(:project_repository, project: project_hashed_storage_1) + create(:project_repository, project: project_legacy_storage_3) + + projects = described_class.without_project_repository.pluck(:id) + + expect(projects).to contain_exactly(project_hashed_storage_2.id, project_legacy_storage_4.id) + end + end + + describe '#disk_path' do + context 'for projects on hashed storage' do + it 'returns the correct disk_path' do + project = described_class.find(project_hashed_storage_1.id) + + expect(project.disk_path).to eq(project_hashed_storage_1.disk_path) + end + end + + context 'for projects on legacy storage' do + it 'returns the correct disk_path' do + project = described_class.find(project_legacy_storage_3.id) + + expect(project.disk_path).to eq(project_legacy_storage_3.disk_path) + end + + it 'raises OrphanedNamespaceError when any parent namespace does not exist' do + subgroup = create(:group, parent: group) + project_orphaned_namespace = create(:project, name: 'baz', path: 'baz', namespace: subgroup, storage_version: nil) + subgroup.update_column(:parent_id, Namespace.maximum(:id).succ) + + project = described_class.find(project_orphaned_namespace.id) + + expect { project.disk_path } + .to raise_error(Gitlab::BackgroundMigration::BackfillProjectRepositories::OrphanedNamespaceError) + end + end + end + 
end +end diff --git a/spec/support/shared_examples/lib/gitlab/background_migration/backfill_project_repositories_examples.rb b/spec/support/shared_examples/lib/gitlab/background_migration/backfill_project_repositories_examples.rb new file mode 100644 index 00000000000..1f688c0f9d3 --- /dev/null +++ b/spec/support/shared_examples/lib/gitlab/background_migration/backfill_project_repositories_examples.rb @@ -0,0 +1,44 @@ +shared_examples 'backfill migration for project repositories' do |storage| + describe '#perform' do + let(:storage_versions) { storage == :legacy ? [nil, 0] : [1, 2] } + let(:storage_version) { storage_versions.first } + let(:namespaces) { table(:namespaces) } + let(:project_repositories) { table(:project_repositories) } + let(:projects) { table(:projects) } + let(:shards) { table(:shards) } + let(:group) { namespaces.create!(name: 'foo', path: 'foo') } + let(:shard) { shards.create!(name: 'default') } + + it "creates a project_repository row for projects on #{storage} storage that needs one" do + storage_versions.each_with_index do |storage_version, index| + projects.create!(name: "foo-#{index}", path: "foo-#{index}", namespace_id: group.id, storage_version: storage_version) + end + + expect { described_class.new.perform(1, projects.last.id) }.to change(project_repositories, :count).by(2) + end + + it "does nothing for projects on #{storage} storage that have already a project_repository row" do + projects.create!(id: 1, name: 'foo', path: 'foo', namespace_id: group.id, storage_version: storage_version) + project_repositories.create!(project_id: 1, disk_path: 'phony/foo/bar', shard_id: shard.id) + + expect { described_class.new.perform(1, projects.last.id) }.not_to change(project_repositories, :count) + end + + it "does nothing for projects on #{storage == :legacy ? 'hashed' : 'legacy'} storage" do + projects.create!(name: 'foo', path: 'foo', namespace_id: group.id, storage_version: storage == :legacy ? 
1 : nil) + + expect { described_class.new.perform(1, projects.last.id) }.not_to change(project_repositories, :count) + end + + it 'inserts rows in a single query' do + projects.create!(name: 'foo', path: 'foo', namespace_id: group.id, storage_version: storage_version, repository_storage: shard.name) + + control_count = ActiveRecord::QueryRecorder.new { described_class.new.perform(1, projects.last.id) } + + projects.create!(name: 'bar', path: 'bar', namespace_id: group.id, storage_version: storage_version, repository_storage: shard.name) + projects.create!(name: 'zoo', path: 'zoo', namespace_id: group.id, storage_version: storage_version, repository_storage: shard.name) + + expect { described_class.new.perform(1, projects.last.id) }.not_to exceed_query_limit(control_count) + end + end +end |