gitlab-foss/scripts/sql_fingerprint_extractor.rb

#!/usr/bin/env ruby
# frozen_string_literal: true

require 'json'
require 'set' # Set is used for the fingerprint collections below
require 'zlib'
require 'logger'
require 'stringio'
require 'rubygems/package'

# SQLFingerprintExtractor extracts and processes SQL query fingerprints
# from various file formats including NDJSON, gzipped NDJSON, and tar.gz archives
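#
# An NDJSON input line is expected to carry a "fingerprint" key and look roughly
# like the following (illustrative only; fields other than "fingerprint" are
# assumptions, not a documented format):
#
#   {"fingerprint": "8a9c1b2d3e4f", "normalized": "SELECT * FROM users WHERE id = $1"}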
class SQLFingerprintExtractor
  attr_reader :logger

  def initialize(logger = nil)
    @logger = logger || Logger.new($stdout)
  end

  # Extract fingerprints from a local file (compressed or uncompressed)
  # Returns an array of query objects with fingerprints
  def extract_queries_from_file(file_path)
    logger.info "Extracting queries from file: #{file_path}" if logger
    queries = []

    begin
      if file_path.end_with?('.gz')
        Zlib::GzipReader.open(file_path) do |gz|
          gz.each_line do |line|
            process_json_line(line, queries)
          end
        end
      else
        File.foreach(file_path) do |line|
          process_json_line(line, queries)
        end
      end
    rescue StandardError => e
      logger.warn "Error reading file: #{e.message}" if logger
      return [] # Return empty array on error
    end

    logger.info "Extracted #{queries.size} queries from file: #{file_path}" if logger
    queries
  end

  # Extract just the fingerprint strings from a file
  # Returns a Set of fingerprint strings
  def extract_fingerprints_from_file(file_path)
    queries = extract_queries_from_file(file_path)
    Set.new(queries.filter_map { |q| q['fingerprint'] })
  end

  # Extract fingerprints from a tar.gz content
  # Returns a Set of fingerprint strings
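  # Each file entry in the archive is assumed to hold raw fingerprints, one per
  # line (an assumption inferred from the line-by-line parsing below), e.g.:
  #
  #   8a9c1b2d3e4f
  #   0f1e2d3c4b5a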
  def extract_from_tar_gz(content, max_size_mb = 250)
    fingerprints = Set.new
    max_size = max_size_mb * (1024**2) # guardrail to prevent issues if unexpectedly large

    begin
      io = StringIO.new(content)
      gz = Zlib::GzipReader.new(io)
      tar = Gem::Package::TarReader.new(gz)

      tar&.each do |entry|
        # Now looking for raw fingerprint files (any text file)
        next unless entry.file? && !entry.directory?

        # Check file size before reading
        if entry.header.size > max_size
          logger.error(
            "File too large: #{entry.header.size / (1024**2)}MB exceeds limit #{max_size_mb}MB"
          )
          return fingerprints
        end

        entry_content = entry.read
        entry_content.each_line do |line|
          fingerprint = line.strip
          fingerprints.add(fingerprint) unless fingerprint.empty?
        end
      end
    rescue StandardError => e
      logger.error "Error processing tar.gz: #{e.message}"
      return Set.new
    end

    fingerprints
  end

  # Write a set of fingerprints to file
  def write_fingerprints_to_file(fingerprints, output_file)
    File.open(output_file, 'w') do |f|
      fingerprints.each { |fp| f.puts(fp) }
    end
    logger.info "Wrote #{fingerprints.size} fingerprints to #{output_file}" if logger
  end

  private

  def process_json_line(line, queries)
    data = JSON.parse(line)
    queries << data if data['fingerprint']
  rescue JSON::ParserError
    # Skip invalid JSON
  end
end

# Command-line script functionality
if __FILE__ == $PROGRAM_NAME
  if ARGV.size < 2
    puts "Usage: #{$PROGRAM_NAME} <input_file> <output_file>"
    exit 1
  end

  input_file = ARGV[0]
  output_file = ARGV[1]
  logger = Logger.new($stdout)

  unless File.exist?(input_file)
    logger.error "Input file not found - #{input_file}"
    exit 1
  end

  begin
    extractor = SQLFingerprintExtractor.new
    fingerprints = extractor.extract_fingerprints_from_file(input_file)
    extractor.write_fingerprints_to_file(fingerprints, output_file)
    logger.info "Successfully extracted #{fingerprints.size} fingerprints to #{output_file}"
  rescue StandardError => e
    logger.error e.message.to_s
    exit 1
  end
end
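
# Example invocation (file names below are hypothetical):
#
#   ruby scripts/sql_fingerprint_extractor.rb queries.ndjson fingerprints.txt
#   ruby scripts/sql_fingerprint_extractor.rb queries.ndjson.gz fingerprints.txt
#
# A minimal programmatic sketch for the tar.gz path (archive name is an assumption):
#
#   extractor = SQLFingerprintExtractor.new
#   fingerprints = extractor.extract_from_tar_gz(File.binread('fingerprints.tar.gz'))
#   extractor.write_fingerprints_to_file(fingerprints, 'fingerprints.txt')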