Files
gitlab-foss/lib/gitlab/database/namespace_each_batch.rb
2024-09-04 09:14:46 +00:00

224 lines
8.3 KiB
Ruby

# frozen_string_literal: true
module Gitlab
module Database
# This class implements an iterator over the namespace hierarchy which uses a recursive
# depth-first algorithm.
# You can read more about the algorithm here:
# https://docs.gitlab.com/ee/development/database/poc_tree_iterator.html
#
# With the class, you can iterate over the whole hierarchy including subgroups and project namespaces
# or just iterate over the subgroups.
#
# Usage:
#
# # To invoke the iterator, you can take any group id.
# # Build the cursor object that will be used for tracking our position in the tree hierarchy.
# cursor = { current_id: 9970, depth: [9970] }
#
# # Instantiate the object.
# iterator = Gitlab::Database::NamespaceEachBatch.new(namespace_class: Namespace, cursor: cursor)
#
# iterator.each_batch(of: 100) do |ids|
# # return namespace ids which can be Group id or Namespaces::ProjectNamespace id
# puts ids
# end
#
# # When you need to break out of the iteration and continue later, you can yield the cursor as a second parameter:
# iterator.each_batch(of: 100) do |ids, new_cursor|
# save_cursor(new_cursor) && break if limit_reached?
# puts ids
# end
#
# You can build a new iterator later and resume the processing.
#
# # Building an iterator that only returns groups:
# iterator = Gitlab::Database::NamespaceEachBatch.new(namespace_class: Group, cursor: cursor)
#
class NamespaceEachBatch
  # Columns every branch of the recursive query exposes, in this order.
  PROJECTIONS = %w[current_id depth ids count index].freeze

  # @param namespace_class [Class] model used for the child/sibling lookups
  #   (the usage examples pass Namespace or Group)
  # @param cursor [Hash] position in the hierarchy, `{ current_id:, depth: [...] }`;
  #   values are coerced with Integer()
  # @raise [ArgumentError] when the cursor has no Array under :depth or a value
  #   cannot be parsed as an integer
  def initialize(namespace_class:, cursor:)
    @namespace_class = namespace_class
    set_cursor!(cursor)
  end

  # Walks the hierarchy starting at the cursor and yields one batch per loaded row.
  #
  # @param of [Integer] positive batch size; each recursive query is limited to `of + 1` rows
  # @yield [ids, new_cursor] the ids collected by the batch and the cursor to resume from
  # @return [Enumerator, nil] an Enumerator when no block is given, otherwise nil
  # @raise [ArgumentError] when `of` is not a positive Integer
  def each_batch(of: 500)
    # A non-positive size can never advance the cursor: with `of: 0` the query
    # only returns the seed row, so the loop below would spin forever.
    raise ArgumentError, 'of must be a positive Integer' unless of.is_a?(Integer) && of.positive?
    return to_enum(:each_batch, of: of) unless block_given?

    current_cursor = cursor.dup
    first_iteration = true

    loop do
      new_cursor, ids = load_batch(cursor: current_cursor, of: of, first_iteration: first_iteration)
      # load_batch returns [] when the query produced no row at all.
      break if new_cursor.nil?

      first_iteration = false
      current_cursor = new_cursor

      yield ids, new_cursor

      # An empty depth means the walk moved above the starting node: done.
      break if new_cursor[:depth].empty?
    end
  end

  private

  attr_reader :namespace_class, :cursor

  # Runs the recursive query and keeps the row with the highest step count,
  # i.e. the last step reached within the row limit.
  #
  # @return [Array] `[new_cursor, ids]`, or `[]` when no row was returned
  def load_batch(cursor:, of:, first_iteration: false)
    recursive_scope = build_recursive_query(cursor, of, first_iteration)

    row = base_namespace_class
      .select(*PROJECTIONS)
      .from(recursive_scope.arel.as(base_namespace_class.table_name))
      .order(count: :desc)
      .limit(1)
      .first

    return [] unless row

    [{ current_id: row[:current_id], depth: row[:depth] }, row[:ids]]
  end

  # rubocop: disable Style/AsciiComments -- Rendering a graph
  # The depth-first algorithm is implemented here. Consider the following group hierarchy:
  #
  #               ┌──┐
  #               │10│
  #          ┌────┴──┴────┐
  #          │            │
  #        ┌─┴┐          ┌┴─┐
  #        │41│          │72│
  #        └─┬┘          └──┘
  #          │
  #        ┌─┴┐
  #   ┌────┤32├─────┐
  #   │    └─┬┘     │
  #   │      │      │
  # ┌─┴┐   ┌─┴┐    ┌┴─┐
  # │11│   │12│    │18│
  # └──┘   └──┘    └──┘
  #
  # 1. Start with node 10 and look up the left-hand child nodes until reaching the leaf. (walk_down)
  # 2. While walking down, record the depth in an array and also store them in the ids array.
  # 3. depth: 10, 41, 32, 11 | ids: 10, 41, 32, 11
  # 4. Start collecting the ids by looking at the nodes on the deepest level. (next_elements)
  # 5. This gives us the rest of the nodes on the same level (parent_id = 32 AND id > 11)
  # 6. depth: 10, 41, 32, 11 | ids: 10, 41, 32, 11, 12, 18
  # 7. When done, move one level up and pop the last value from the depth. (up_one_level)
  # 8. depth: 10, 41, 32 | ids: 10, 41, 32, 11, 12, 18
  # 9. Do the same, look at the nodes on the same level: no records, 32 was already collected
  # 10. depth: 10, 41, 32 | ids: 10, 41, 32, 11, 12, 18
  # 11. Move one level up again and look at the nodes on the same level.
  # 12. depth: 10, 41 | ids: 10, 41, 32, 11, 12, 18, 72
  # 13. Move one level up again, we reached the root node, iteration is done.
  # 14. depth: 10 | ids: 10, 41, 32, 11, 12, 18, 72
  #
  # By tracking the currently accessed node and the depth we can stop and restore the processing of
  # the hierarchy at any point.
  #
  # rubocop: enable Style/AsciiComments
  def build_recursive_query(cursor, of, first_iteration)
    # Only the very first batch reports the starting node itself; a resumed
    # cursor starts with an empty ids array.
    ids = first_iteration ? cursor[:current_id] : ''

    recursive_cte = Gitlab::SQL::RecursiveCTE.new(:result,
      union_args: {
        remove_order: false,
        remove_duplicates: false
      })

    # Seed row: the cursor values rendered as SQL literals. The instance cursor
    # is integer-coerced in set_cursor!; later cursors are built from query rows.
    recursive_cte << base_namespace_class.select(
      Arel.sql("#{cursor[:current_id]}::bigint").as('current_id'),
      Arel.sql("ARRAY[#{cursor[:depth].join(',')}]::bigint[]").as('depth'),
      Arel.sql("ARRAY[#{ids}]::bigint[]").as('ids'),
      Arel.sql('1::bigint AS count'),
      Arel.sql('0::bigint AS index')
    ).from('(VALUES (1)) AS initializer_row')
      .where_exists(namespace_exists_query)

    cte = Gitlab::SQL::CTE.new(:cte, base_namespace_class.select('result.*').from('result'))

    # Recursive step: of the three candidate moves, take the one with the
    # lowest index (walk_down = 1, next_elements = 2, up_one_level = 3).
    union_query = base_namespace_class.with(cte.to_arel).from_union(
      walk_down,
      next_elements,
      up_one_level,
      remove_duplicates: false,
      remove_order: false
    ).select(*PROJECTIONS).order(base_namespace_class.arel_table[:index].asc).limit(1)

    recursive_cte << union_query

    base_namespace_class.with
      .recursive(recursive_cte.to_arel)
      .from(recursive_cte.alias_to(namespace_class.arel_table))
      .select(*PROJECTIONS)
      .limit(of + 1)
  end

  # Guard for the seed row: without a matching namespace the recursive query
  # yields nothing and load_batch returns [].
  #
  # NOTE(review): this reads the cursor the iterator was built with, not the
  # per-batch cursor passed to build_recursive_query -- confirm this is intended.
  def namespace_exists_query
    base_namespace_class.where(id: cursor[:current_id])
  end

  # Candidate move 1: step to the child of the current node with the lowest id,
  # appending it to both depth and ids.
  def walk_down
    lateral_query = namespace_class
      .select(:id)
      .where('parent_id = cte.current_id')
      .order(:id)
      .limit(1)

    base_namespace_class.select(
      base_namespace_class.arel_table[:id].as('current_id'),
      Arel.sql("cte.depth || #{base_namespace_table}.id::bigint").as('depth'),
      Arel.sql("cte.ids || #{base_namespace_table}.id::bigint").as('ids'),
      Arel.sql('cte.count + 1').as('count'),
      Arel.sql('1::bigint AS index')
    ).from("cte, LATERAL (#{lateral_query.to_sql}) #{base_namespace_table}")
  end

  # Candidate move 2: step to the next sibling, i.e. the lowest id greater than
  # the last depth entry under the same parent (the second-to-last depth entry).
  # The last depth entry is replaced by the sibling and the id is appended to ids.
  def next_elements
    lateral_query = namespace_class
      .select(:id)
      .where("#{base_namespace_table}.parent_id = cte.depth[array_length(cte.depth, 1) - 1]")
      .where("#{base_namespace_table}.id > cte.depth[array_length(cte.depth, 1)]")
      .order(:id)
      .limit(1)

    base_namespace_class.select(
      base_namespace_class.arel_table[:id].as('current_id'),
      Arel.sql("cte.depth[:array_length(cte.depth, 1) - 1] || #{base_namespace_table}.id::bigint").as('depth'),
      Arel.sql("cte.ids || #{base_namespace_table}.id::bigint").as('ids'),
      Arel.sql('cte.count + 1').as('count'),
      Arel.sql('3::bigint AS index').then { Arel.sql('2::bigint AS index') }
    ).from("cte, LATERAL (#{lateral_query.to_sql}) #{base_namespace_table}")
  end

  # Candidate move 3: drop the last depth entry, keeping current_id and ids.
  # Not available once depth is already empty, which ends the recursion.
  def up_one_level
    base_namespace_class.select(
      Arel.sql('cte.current_id').as('current_id'),
      Arel.sql('cte.depth[:array_length(cte.depth, 1) - 1]').as('depth'),
      Arel.sql('cte.ids').as('ids'),
      Arel.sql('cte.count + 1').as('count'),
      Arel.sql('3::bigint AS index')
    ).from('cte')
      .where("cte.depth <> '{}'")
      .limit(1)
  end

  # Model that anchors the generated SQL regardless of the namespace_class
  # passed by the caller.
  def base_namespace_class
    Namespace
  end

  def base_namespace_table
    base_namespace_class.quoted_table_name
  end

  # Validates the caller-supplied cursor and stores an integer-only copy.
  #
  # @raise [ArgumentError] when :depth is not an Array, or Integer() rejects a value
  # @raise [TypeError] when Integer() is given nil (e.g. a missing :current_id)
  def set_cursor!(original_cursor)
    depth = original_cursor[:depth] if original_cursor
    raise ArgumentError, 'cursor must provide an Array under :depth' unless depth.is_a?(Array)

    @cursor = {
      current_id: Integer(original_cursor[:current_id]),
      depth: depth.map { |value| Integer(value) }
    }
  end
end
end
end