gitlab-foss/lib/gitlab/background_migration/backfill_partitioned_uploads.rb

# frozen_string_literal: true

module Gitlab
  module BackgroundMigration
    class BackfillPartitionedUploads < BatchedMigrationJob
      feature_category :database
      operation_name :backfill

      class Upload < ApplicationRecord
        self.table_name = 'uploads_9ba88c4165'
      end

      def perform
        each_sub_batch do |sub_batch|
          tables_and_models.each do |model_table, model_name, sources, targets, join_key, db_name|
            process_upload_type(sub_batch, model_table, model_name, sources, targets, join_key, db_name)
          end
        end
      end

      private

      def sharding_key_columns
        %w[organization_id namespace_id project_id]
      end

      def columns
        Upload.column_names - sharding_key_columns
      end

      def tables_and_models
        # model_table, model_name, sources, targets, join_key, db_name
        [
          ['abuse_reports', 'AbuseReport', %w[]],
          ['achievements', 'Achievements::Achievement', %w[namespace_id]],
          ['ai_vectorizable_files', 'Ai::VectorizableFile', %w[project_id]],
          ['alert_management_alert_metric_images', 'AlertManagement::MetricImage', %w[project_id]],
          ['appearances', 'Appearance', %w[]],
          ['bulk_import_export_uploads', 'BulkImports::ExportUpload', %w[group_id project_id],
            %w[namespace_id project_id]],
          ['design_management_designs_versions', 'DesignManagement::Action', %w[namespace_id]],
          ['import_export_uploads', 'ImportExportUpload', %w[group_id project_id], %w[namespace_id project_id]],
          ['issuable_metric_images', 'IssuableMetricImage', %w[namespace_id]],
          ['namespaces', 'Namespace', %w[id], %w[namespace_id]],
          ['organization_details', 'Organizations::OrganizationDetail', %w[organization_id], nil, 'organization_id'],
          ['project_relation_export_uploads', 'Projects::ImportExport::RelationExportUpload', %w[project_id]],
          ['topics', 'Projects::Topic', %w[organization_id]],
          ['projects', 'Project', %w[id], %w[project_id]],
          ['snippets', 'Snippet', %w[organization_id]],
          ['user_permission_export_uploads', 'UserPermissionExportUpload', %w[]],
          ['users', 'User', %w[]],
          # Sec tables
          ['dependency_list_exports', 'Dependencies::DependencyListExport', %w[organization_id group_id project_id],
            %w[organization_id namespace_id project_id], nil, :sec],
          ['dependency_list_export_parts', 'Dependencies::DependencyListExport::Part', %w[organization_id], nil, nil,
            :sec],
          ['vulnerability_exports', 'Vulnerabilities::Export', %w[organization_id], nil, nil, :sec],
          ['vulnerability_export_parts', 'Vulnerabilities::Export::Part', %w[organization_id], nil, nil, :sec],
          ['vulnerability_remediations', 'Vulnerabilities::Remediation', %w[project_id], nil, nil, :sec],
          ['vulnerability_archive_exports', 'Vulnerabilities::ArchiveExport', %w[project_id], nil, nil, :sec]
        ]
      end

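      # Reading one entry above, as consumed by process_upload_type below:
      # ['namespaces', 'Namespace', %w[id], %w[namespace_id]] means uploads whose model_type
      # is 'Namespace' are joined to namespaces on namespaces.id, namespace_id is backfilled
      # from that id, and the remaining sharding key columns are reset to NULL.
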
      # Backfill the partitioned table `uploads_9ba88c4165`. For each sub-batch,
      # execute one upsert query per model_type, joining with the respective
      # model_table. The join excludes uploads belonging to records that no longer
      # exist. An illustrative example of the generated statement follows this method.
      #
      # Arguments:
      #   sub_batch   - batch to operate on
      #   model_table - table storing the parent model
      #   model_name  - model class name
      #   sources     - columns to source the sharding key values from
      #   targets     - sharding key columns to backfill
      #   join_key    - column to join with the model table, defaults to id
      #   db_name     - database the model table belongs to
      def process_upload_type(sub_batch, model_table, model_name, sources, targets, join_key, db_name)
        relation = sub_batch.select(:id, :model_type).limit(sub_batch_size)
        targets ||= sources
        join_key ||= 'id'

        # Columns that will be reset (nullified) because they are not used as sharding keys
        reset_columns = sharding_key_columns - targets
        # All columns to backfill
        target_columns = (columns + targets + reset_columns).join(', ')
        # All columns to source from
        source_columns = source_columns_sql(sources, reset_columns)

        # For existing records, update only the sharding key columns (if any)
        on_conflict = if targets.any?
                        "UPDATE SET #{sharding_key_columns.map { |c| "#{c} = EXCLUDED.#{c}" }.join(', ')}"
                      else
                        "NOTHING"
                      end

        # For models stored in the Sec database we first fetch the values we need
        # and add them to the upsert as a CTE
        model_values_cte = ""
        if db_name == :sec
          sec_cte = sec_model_values_cte(sub_batch, model_name, join_key, sources, model_table)
          return unless sec_cte

          source_columns = source_columns_sql(sources, reset_columns, nullif: true)
          model_values_cte = sec_cte
        end

        upsert = <<~SQL
          WITH relation AS MATERIALIZED (#{relation.to_sql}),
          filtered_relation AS MATERIALIZED (
            SELECT id FROM relation WHERE model_type = '#{model_name}' LIMIT #{sub_batch_size}
          )
          #{model_values_cte}
          INSERT INTO uploads_9ba88c4165 (#{target_columns})
          SELECT #{source_columns} FROM uploads
          JOIN #{model_table} AS model ON uploads.model_id = model.#{join_key}
          WHERE uploads.id IN (SELECT id FROM filtered_relation)
          ON CONFLICT ON CONSTRAINT uploads_9ba88c4165_pkey DO #{on_conflict}
        SQL

        connection.execute(upsert)
      end

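      # Roughly, the statement generated for the ['projects', 'Project', %w[id], %w[project_id]]
      # entry looks like:
      #
      #   WITH relation AS MATERIALIZED (<sub-batch SELECT of id, model_type>),
      #   filtered_relation AS MATERIALIZED (
      #     SELECT id FROM relation WHERE model_type = 'Project' LIMIT <sub_batch_size>
      #   )
      #   INSERT INTO uploads_9ba88c4165 (<non-sharding-key columns>, project_id, organization_id, namespace_id)
      #   SELECT uploads.<non-sharding-key columns>, model.id, NULL, NULL FROM uploads
      #   JOIN projects AS model ON uploads.model_id = model.id
      #   WHERE uploads.id IN (SELECT id FROM filtered_relation)
      #   ON CONFLICT ON CONSTRAINT uploads_9ba88c4165_pkey DO UPDATE SET
      #     organization_id = EXCLUDED.organization_id, namespace_id = EXCLUDED.namespace_id,
      #     project_id = EXCLUDED.project_id
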
      def source_columns_sql(sources, reset_columns, nullif: false)
        (
          columns.map { |c| "uploads.#{c}" } +
            # Convert -1 back to NULL using NULLIF
            sources.map { |c| nullif ? "NULLIF(model.#{c}, -1)" : "model.#{c}" } +
            reset_columns.map { 'NULL' }
        ).join(', ')
      end

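      # For example, with sources = %w[project_id], reset_columns = %w[organization_id namespace_id]
      # and nullif: true, the result is along the lines of:
      #   "uploads.id, uploads.model_id, ..., NULLIF(model.project_id, -1), NULL, NULL"
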
      def sec_model_values_cte(sub_batch, model_name, join_key, sources, model_table)
        model_ids = sub_batch.where(model_type: model_name).limit(sub_batch_size).pluck(:model_id)
        return unless model_ids.any?

        columns = [join_key] + sources
        rows = ::SecApplicationRecord.connection.select_rows(
          "SELECT #{columns.join(', ')} FROM #{model_table} WHERE #{join_key} IN (#{model_ids.join(', ')})"
        )
        return unless rows.any?

        # Replace NULL values with -1 to avoid PG::DatatypeMismatch errors
        values = rows.map { |row| "(#{row.map { |v| v.presence || -1 }.join(', ')})" }.join(', ')

        ", #{model_table}(#{columns.join(', ')}) AS (VALUES #{values})"
      end

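      # As an illustration, for the 'vulnerability_exports' table with join_key 'id' and
      # sources %w[organization_id], the returned fragment is shaped like:
      #   ", vulnerability_exports(id, organization_id) AS (VALUES (1, 5), (2, -1))"
      # process_upload_type splices this into its WITH clause so the main query can join
      # against the Sec-database values without a cross-database query.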
    end
  end
end