Files
gitlab-foss/lib/gitlab/background_migration/deduplicate_lfs_objects_projects.rb
2024-10-02 21:13:21 +00:00

72 lines
3.1 KiB
Ruby

# frozen_string_literal: true
module Gitlab
module BackgroundMigration
class DeduplicateLfsObjectsProjects < BatchedMigrationJob
operation_name :deduplicates_lfs_objects_projects
feature_category :source_code_management
# Temporary class to link AR model to the `lfs_objects_projects` table
class LfsObjectsProject < ::ApplicationRecord
include EachBatch
self.table_name = 'lfs_objects_projects'
end
def perform
each_sub_batch do |relation|
data = duplicates_by_project_id_and_lfs_object_id(relation)
next if data.empty?
# After plucking the duplicates, build a VALUE list
id_list = Arel::Nodes::ValuesList.new(data).to_sql
# Use the same GROUP BY query as in the MR to properly narrow down the duplicated records.
# In the previous query we didn't include the repository_type because it is not covered with an index.
subquery = LfsObjectsProject
.where("(project_id, lfs_object_id) IN (#{id_list})") # rubocop:disable GitlabSecurity/SqlInjection -- there is no user input given
.select('project_id, lfs_object_id, repository_type, MAX(id) AS max_id')
.group('project_id, lfs_object_id, repository_type')
.having('COUNT(*) > 1')
join_query = <<~SQL.squish
INNER JOIN (#{subquery.to_sql}) AS duplicates
ON lfs_objects_projects.project_id = duplicates.project_id
AND lfs_objects_projects.lfs_object_id = duplicates.lfs_object_id
AND lfs_objects_projects.repository_type IS NOT DISTINCT FROM duplicates.repository_type
SQL
duplicated_lfs_objects_projects = LfsObjectsProject.joins(join_query).where.not(
'lfs_objects_projects.id = duplicates.max_id'
)
LfsObjectsProject.where(id: duplicated_lfs_objects_projects.select(:id)).delete_all
end
end
private
def duplicates_by_project_id_and_lfs_object_id(relation)
# Select project_id and lfs_object_id pairs which have duplicates.
inner_query = LfsObjectsProject
.select('1')
.from('lfs_objects_projects lop')
.where('lop.project_id = lfs_objects_projects.project_id')
.where('lop.lfs_object_id = lfs_objects_projects.lfs_object_id')
.limit(2)
count_query = LfsObjectsProject.select('COUNT(*) AS count').from("(#{inner_query.to_sql}) cnt")
cte = Gitlab::SQL::CTE.new(:distinct_values, relation.select(:project_id, :lfs_object_id).distinct)
# Limit count to determine if there is a duplicate, we don't need to load all duplicated rows
# (only 2 rows are enough for a project_id, lfs_object_id) pair
cte.apply_to(LfsObjectsProject.where({}))
.where("(#{count_query.to_sql}) = 2") # rubocop:disable GitlabSecurity/SqlInjection -- there is no user input given
.pluck(:project_id, :lfs_object_id)
end
end
end
end