gitlab-foss/lib/gitlab/metrics/samplers/concurrency_limit_sampler.rb

# frozen_string_literal: true

module Gitlab
  module Metrics
    module Samplers
      class ConcurrencyLimitSampler < BaseSampler
        include ExclusiveLeaseGuard

        # Scrape timing explanation:
        # - Prometheus scrapes occur every minute
        # - Our sampler lease lasts for 5 minutes
        # - After writing metrics, we sleep in 30s intervals until the lease
        #   expires, then reset the metrics to 0
        DEFAULT_SAMPLING_INTERVAL_SECONDS = 30
        LEASE_TIMEOUT = 300

        # The sleep ensures that:
        # 1. Process A runs the sampler and takes the lease
        # 2. Other processes running the sampler cannot take the lease, so their runs are no-ops
        # 3. While the lease still exists (for 5 minutes):
        #    a. The sampler writes the metrics
        #    b. The sampler sleeps for 30s
        #    c. Scrapes (which occur every minute) land in this window, so we expect 4 or 5 scrapes per lease
        # 4. Reset metrics to 0
        # 5. The first other process to run the sampler picks up the lease; go to 1
        #
        # This ensures that on every scrape one process reports the correct data
        # while the process that previously held the lease reports 0.
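        #
        # Illustrative timeline, assuming the default values above (an
        # illustration, not a guarantee of exact timings):
        #   t=0s        process A obtains the 5-minute lease
        #   t=0s-300s   A reports metrics, then sleeps 30s, in a loop;
        #               scrapes at t=60s, 120s, ... read A's live data
        #   t=300s      the lease expires, the loop exits, and A resets all gauges to 0
        #   t>300s      the next process to run the sampler takes a fresh lease (step 1)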
        def sample
          try_obtain_lease do
            # Keep reporting the metrics while the lease is valid
            # to ensure we have continuous data. Also check if the
            # sampler is still running because a SIGTERM will cause
            # the sleep to be interrupted and the loop to run again
            # until the condition is false.
            while running && exclusive_lease.same_uuid?
              report_metrics

              # Ensure that we don't sleep if the state changed
              # after reporting metrics.
              break unless running

              Kernel.sleep(DEFAULT_SAMPLING_INTERVAL_SECONDS)
            end

            # Reset metrics to ensure only the next sample reports fresh data.
            reset_metrics
          end
        end

        private

        # Used by ExclusiveLeaseGuard
        def lease_timeout
          LEASE_TIMEOUT
        end
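
        # Report one sample per worker registered with the concurrency limit
        # middleware: its deferred queue size, its current concurrency, and
        # its configured limit.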
        def report_metrics
          worker_maps.workers.each do |w|
            queue_size = concurrent_limit_service.queue_size(w.name)
            report_queue_size(w, queue_size)

            concurrent_worker_count = concurrent_limit_service.concurrent_worker_count(w.name)
            report_concurrent_workers(w, concurrent_worker_count)

            limit = worker_maps.limit_for(worker: w)
            report_limit(w, limit)
          end
        end
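
        # Zero out every gauge rather than leaving the last values in place,
        # so that scrapes of this process after it loses the lease do not
        # read stale data as if it were current.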
        def reset_metrics
          worker_maps.workers.each do |w|
            report_queue_size(w, 0)
            report_concurrent_workers(w, 0)
            report_limit(w, 0)
          end
        end
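
        # The middleware-maintained map of concurrency-limited worker classes
        # and their configured limits.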
        def worker_maps
          Gitlab::SidekiqMiddleware::ConcurrencyLimit::WorkersMap
        end
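
        # The service the middleware uses to track, per worker, how many jobs
        # are deferred and how many are currently executing.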
        def concurrent_limit_service
          Gitlab::SidekiqMiddleware::ConcurrencyLimit::ConcurrencyLimitService
        end
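
        # The gauge handles below are memoized in instance variables so each
        # metric is only looked up or registered once per sampler instance.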
        def report_queue_size(worker, queue_size)
          @queue_size_metric ||= Gitlab::Metrics.gauge(:sidekiq_concurrency_limit_queue_jobs,
            'Number of jobs queued by the concurrency limit middleware.')
          @queue_size_metric.set({ worker: worker.name, feature_category: worker.get_feature_category }, queue_size)
        end

        def report_concurrent_workers(worker, concurrent_worker_count)
          @concurrency_metric ||= Gitlab::Metrics.gauge(:sidekiq_concurrency_limit_current_concurrent_jobs,
            'Current number of concurrent running jobs.')
          @concurrency_metric.set({ worker: worker.name, feature_category: worker.get_feature_category },
            concurrent_worker_count)
        end

        def report_limit(worker, limit)
          @limit_metric ||= Gitlab::Metrics.gauge(:sidekiq_concurrency_limit_max_concurrent_jobs,
            'Max number of concurrent running jobs.')
          @limit_metric.set({ worker: worker.name, feature_category: worker.get_feature_category }, limit)
        end
      end
    end
  end
end
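
# A minimal usage sketch, assuming BaseSampler follows the Daemon pattern
# (initialize_instance + start) used by the other samplers in this directory;
# the actual wiring is not shown in this file:
#
#   Gitlab::Metrics::Samplers::ConcurrencyLimitSampler.initialize_instance.start
#
# Because only the current lease holder reports live values and the previous
# holder resets its gauges to 0, summing a gauge across processes, e.g.
# `sum by (worker) (sidekiq_concurrency_limit_queue_jobs)`, yields the true value.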