mirror of
https://gitlab.com/gitlab-org/gitlab-foss.git
synced 2025-08-01 16:04:19 +00:00
72 lines
3.1 KiB
Ruby
72 lines
3.1 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
module Gitlab
  module BackgroundMigration
    # Batched background migration job that deletes duplicated rows from the
    # `lfs_objects_projects` table. Rows count as duplicates when they share the
    # same (project_id, lfs_object_id, repository_type); within each such group
    # the row with the highest id is kept and the others are deleted.
    class DeduplicateLfsObjectsProjects < BatchedMigrationJob
      operation_name :deduplicates_lfs_objects_projects
      feature_category :source_code_management

      # Migration-local ActiveRecord model bound to the `lfs_objects_projects` table.
      class LfsObjectsProject < ::ApplicationRecord
        include EachBatch

        self.table_name = 'lfs_objects_projects'
      end

      # For each sub-batch: collect the (project_id, lfs_object_id) pairs that
      # have more than one row, then delete every row of a duplicated
      # (project_id, lfs_object_id, repository_type) group except the one whose
      # id equals the group's MAX(id). Sub-batches without duplicates are skipped.
      def perform
        each_sub_batch do |relation|
          pairs = duplicates_by_project_id_and_lfs_object_id(relation)

          next if pairs.empty?

          redundant_rows = LfsObjectsProject
            .joins(duplicates_join(pairs))
            .where.not('lfs_objects_projects.id = duplicates.max_id')

          LfsObjectsProject.where(id: redundant_rows.select(:id)).delete_all
        end
      end

      private

      # Builds the relation that groups the rows matching the given
      # [project_id, lfs_object_id] pairs and keeps only groups with more than
      # one row, exposing each group's highest id as `max_id`.
      #
      # Use the same GROUP BY query as in the MR to properly narrow down the duplicated records.
      # The pair lookup did not include the repository_type because it is not covered with an index,
      # so it is taken into account here.
      def duplicate_groups(pairs)
        # After plucking the duplicates, render them as a VALUES list for the row-wise IN condition.
        values_sql = Arel::Nodes::ValuesList.new(pairs).to_sql

        LfsObjectsProject
          .where("(project_id, lfs_object_id) IN (#{values_sql})") # rubocop:disable GitlabSecurity/SqlInjection -- there is no user input given
          .group('project_id, lfs_object_id, repository_type')
          .having('COUNT(*) > 1')
          .select('project_id, lfs_object_id, repository_type, MAX(id) AS max_id')
      end

      # Returns the SQL fragment joining `lfs_objects_projects` to the duplicate
      # groups (aliased `duplicates`). `IS NOT DISTINCT FROM` is used for
      # repository_type so that NULL values compare as equal to each other.
      def duplicates_join(pairs)
        <<~SQL.squish
          INNER JOIN (#{duplicate_groups(pairs).to_sql}) AS duplicates
          ON lfs_objects_projects.project_id = duplicates.project_id
          AND lfs_objects_projects.lfs_object_id = duplicates.lfs_object_id
          AND lfs_objects_projects.repository_type IS NOT DISTINCT FROM duplicates.repository_type
        SQL
      end

      # Plucks the [project_id, lfs_object_id] pairs, taken from the distinct
      # pairs of the given sub-batch relation, for which a second
      # `lfs_objects_projects` row with the same pair exists.
      def duplicates_by_project_id_and_lfs_object_id(relation)
        # Correlated probe over `lfs_objects_projects` (aliased `lop`) for rows sharing the
        # outer row's project_id and lfs_object_id. It is capped at 2 rows: we don't need to
        # load all duplicated rows, two matches are enough to know the pair is duplicated.
        probe_sql = LfsObjectsProject
          .from('lfs_objects_projects lop')
          .where('lop.project_id = lfs_objects_projects.project_id')
          .where('lop.lfs_object_id = lfs_objects_projects.lfs_object_id')
          .select('1')
          .limit(2)
          .to_sql

        probe_count_sql = LfsObjectsProject.select('COUNT(*) AS count').from("(#{probe_sql}) cnt").to_sql

        # Distinct (project_id, lfs_object_id) pairs of the sub-batch, wrapped in a CTE named `distinct_values`.
        distinct_pairs = relation.select(:project_id, :lfs_object_id).distinct
        cte = Gitlab::SQL::CTE.new(:distinct_values, distinct_pairs)

        # Keep only the pairs whose capped probe count reached 2.
        cte.apply_to(LfsObjectsProject.where({}))
          .where("(#{probe_count_sql}) = 2") # rubocop:disable GitlabSecurity/SqlInjection -- there is no user input given
          .pluck(:project_id, :lfs_object_id)
      end
    end
  end
end
|