Files
nextcloud-recognize/lib/Service/FaceClusterAnalyzer.php
Marcel Klehr 408f41227c fix: Fix psalm issues
Signed-off-by: Marcel Klehr <mklehr@gmx.net>
2025-09-08 10:15:33 +02:00

281 lines
9.7 KiB
PHP

<?php
/*
* Copyright (c) 2022 The Recognize contributors.
* This file is licensed under the Affero General Public License version 3 or later. See the COPYING file.
*/
declare(strict_types=1);
namespace OCA\Recognize\Service;
use \OCA\Recognize\Vendor\Rubix\ML\Datasets\Labeled;
use \OCA\Recognize\Vendor\Rubix\ML\Kernels\Distance\Euclidean;
use OCA\Recognize\Clustering\HDBSCAN;
use OCA\Recognize\Db\FaceCluster;
use OCA\Recognize\Db\FaceClusterMapper;
use OCA\Recognize\Db\FaceDetection;
use OCA\Recognize\Db\FaceDetectionMapper;
/**
 * Groups the face detections of a user into face clusters ("persons").
 *
 * Fresh (never clustered) detections are clustered with HDBSCAN together with
 * a sample of previously rejected detections and a reference sample drawn from
 * every already existing cluster. The reference samples "vote" on whether a
 * newly found cluster is a continuation of an existing cluster or a new one.
 */
final class FaceClusterAnalyzer {
    /** Default minimum number of detections (fresh + rejected + sampled) needed before clustering runs. */
    public const MIN_DATASET_SIZE = 120;
    /**
     * Passed as both size arguments of the detection mapper queries.
     * NOTE(review): presumably the minimum relative width and height of a face — confirm against FaceDetectionMapper.
     */
    public const MIN_DETECTION_SIZE = 0.03;
    /** First argument handed to HDBSCAN::predict(). */
    public const MIN_CLUSTER_SEPARATION = 0.35;
    /** Second argument handed to HDBSCAN::predict(). */
    public const MAX_CLUSTER_EDGE_LENGTH = 0.5;
    /** Length of the centroid vector built by calculateCentroidOfDetections(). */
    public const DIMENSIONS = 128;
    /** A found cluster whose overlap with an existing cluster is below this value becomes a new cluster. */
    public const MAX_OVERLAP_NEW_CLUSTER = 0.1;
    /** A found cluster whose overlap with an existing cluster is above this value is merged into that cluster. */
    public const MIN_OVERLAP_EXISTING_CLUSTER = 0.5;

    private FaceDetectionMapper $faceDetections;
    private FaceClusterMapper $faceClusters;
    private Logger $logger;
    /** Minimum dataset size actually enforced; overridable via setMinDatasetSize(). */
    private int $minDatasetSize = self::MIN_DATASET_SIZE;
    private SettingsService $settingsService;
    /** Lazily created distance kernel shared by all calls to distance(). */
    private static ?Euclidean $distance = null;

    public function __construct(FaceDetectionMapper $faceDetections, FaceClusterMapper $faceClusters, Logger $logger, SettingsService $settingsService) {
        $this->faceDetections = $faceDetections;
        $this->faceClusters = $faceClusters;
        $this->logger = $logger;
        $this->settingsService = $settingsService;
    }

    /**
     * Override the minimum number of detections required for a clustering run.
     */
    public function setMinDatasetSize(int $minSize) : void {
        $this->minDatasetSize = $minSize;
    }

    /**
     * Run one clustering pass for the given user and persist the assignments.
     *
     * Returns early (without touching any setting) when fewer than the minimum
     * number of detections are available or when there are no fresh detections.
     * On completion the settings `clusterFaces.status` and `clusterFaces.lastRun`
     * are written.
     *
     * @param string $userId
     * @param int $batchSize Upper bound hint for the number of detections to process;
     *                       0 processes all unclustered detections and lifts the memory limit.
     * @throws \OCP\DB\Exception
     * @throws \JsonException
     */
    public function calculateClusters(string $userId, int $batchSize = 0): void {
        $this->logger->debug('ClusterDebug: Retrieving face detections for user ' . $userId);

        if ($batchSize === 0) {
            // Unbounded run: every unclustered detection is loaded at once.
            ini_set('memory_limit', '-1');
        }

        // Draw a reference sample from every existing cluster. The sample size of a
        // cluster is also the maximum number of votes that cluster can cast later on.
        $existingClusters = $this->faceClusters->findByUserId($userId);
        $referenceSampleSize = $this->getReferenceSampleSize(count($existingClusters));
        $sampledDetections = [];
        /** @var array<int,int> $maxVotesByCluster */
        $maxVotesByCluster = [];
        foreach ($existingClusters as $existingCluster) {
            $sampled = $this->faceDetections->findClusterSample($existingCluster->getId(), $referenceSampleSize);
            $sampledDetections = array_merge($sampledDetections, $sampled);
            $maxVotesByCluster[$existingCluster->getId()] = count($sampled);
        }

        if ($batchSize > 0) {
            $rejectedDetections = $this->faceDetections->sampleRejectedDetectionsByUserId($userId, $this->getRejectSampleSize($batchSize), self::MIN_DETECTION_SIZE, self::MIN_DETECTION_SIZE);
            // Always request at least 500 fresh detections, even if the batch is already "full".
            $requestedFreshDetectionCount = max($batchSize - count($rejectedDetections) - count($sampledDetections), 500);
            $freshDetections = $this->faceDetections->findUnclusteredByUserId($userId, $requestedFreshDetectionCount, self::MIN_DETECTION_SIZE, self::MIN_DETECTION_SIZE);
        } else {
            $freshDetections = $this->faceDetections->findUnclusteredByUserId($userId, 0, self::MIN_DETECTION_SIZE, self::MIN_DETECTION_SIZE);
            $rejectedDetections = $this->faceDetections->sampleRejectedDetectionsByUserId($userId, $this->getRejectSampleSize(count($freshDetections)), self::MIN_DETECTION_SIZE, self::MIN_DETECTION_SIZE);
        }

        // Layout of $detections: [fresh..., rejected..., sampled...]. Every index below
        // count($unclusteredDetections) is therefore an unclustered detection, every
        // index at or above it is a reference detection of an existing cluster.
        $unclusteredDetections = array_merge($freshDetections, $rejectedDetections);
        $detections = array_merge($unclusteredDetections, $sampledDetections);
        $unclusteredCount = count($unclusteredDetections);

        if (count($detections) < $this->minDatasetSize || count($freshDetections) === 0) {
            $this->logger->debug('ClusterDebug: Not enough face detections found');
            return;
        }

        $this->logger->debug('ClusterDebug: Found ' . count($freshDetections) . " fresh detections. Adding " . count($rejectedDetections). " old detections and " . count($sampledDetections). " sampled detections from already existing clusters. Calculating clusters on " . count($detections) . " detections.");

        // Each sample is labeled with its own index in $detections so that cluster
        // vertices can be mapped back to detections.
        $dataset = new Labeled(array_map(static function (FaceDetection $detection): array {
            return $detection->getVector();
        }, $detections), array_combine(array_keys($detections), array_keys($detections)), false);
        // NOTE(review): the return value is discarded; kept because it cannot be
        // confirmed from this file that the call has no side effects.
        $dataset->features();

        $n = count($detections);
        $hdbscan = new HDBSCAN($dataset, $this->getMinClusterSize($n), $this->getMinSampleSize($n));

        $numberOfClusteredDetections = 0;
        $clusters = $hdbscan->predict(self::MIN_CLUSTER_SEPARATION, self::MAX_CLUSTER_EDGE_LENGTH);

        foreach ($clusters as $flatCluster) {
            /** @var int[] $detectionKeys */
            $detectionKeys = array_keys($flatCluster->getClusterVertices());
            // The centroid must be built from the detections that belong to this
            // cluster, i.e. the ones whose index is one of the vertex keys.
            $clusterDetections = self::selectDetectionsByKeys($detections, $detectionKeys);
            $clusterCentroid = self::calculateCentroidOfDetections($clusterDetections);

            // Let already clustered detections vote which
            // clusterId these newly clustered detections get
            $votes = [];
            foreach ($detectionKeys as $detectionKey) {
                if ($detectionKey < $unclusteredCount) {
                    continue;
                }
                $votes[] = $detections[$detectionKey]->getClusterId() ?? -1;
            }
            [$oldClusterId, $overlap] = self::electExistingCluster($votes, $maxVotesByCluster);

            if ($overlap > self::MIN_OVERLAP_EXISTING_CLUSTER) {
                // Enough of the existing cluster's reference sample ended up here: keep that cluster.
                $cluster = $this->faceClusters->find($oldClusterId);
            } elseif ($overlap < self::MAX_OVERLAP_NEW_CLUSTER) {
                // (Almost) no reference detections in here: this is a new cluster.
                $cluster = new FaceCluster();
                $cluster->setTitle('');
                $cluster->setUserId($userId);
                $this->faceClusters->insert($cluster);
            } else {
                // Ambiguous overlap: neither clearly new nor clearly an existing cluster. Don't add to it.
                continue;
            }

            // Loop invariant: a null-vector centroid disables the threshold check below.
            $centroidIsNotNullVector = count(array_filter($clusterCentroid, static fn ($el) => $el !== 0.0)) > 0;

            foreach ($detectionKeys as $detectionKey) {
                if ($detectionKey >= $unclusteredCount) {
                    // This is a sampled, already clustered detection, ignore.
                    continue;
                }
                $detection = $unclusteredDetections[$detectionKey];
                $threshold = $detection->getThreshold();
                if ($threshold > 0.0 && $centroidIsNotNullVector) {
                    // If a threshold is set for this detection and its vector is farther away from the centroid
                    // than the threshold, skip assigning this detection to the cluster
                    if (self::distance($clusterCentroid, $detection->getVector()) >= $threshold) {
                        continue;
                    }
                }
                $this->faceDetections->assocWithCluster($detection, $cluster);
                $numberOfClusteredDetections += 1;
            }
        }

        $this->logger->debug('ClusterDebug: Clustering complete. Total num of clustered detections: ' . $numberOfClusteredDetections);

        foreach ($unclusteredDetections as $detection) {
            if ($detection->getClusterId() === null) {
                // This detection was run through clustering but wasn't assigned to any cluster
                $detection->setClusterId(-1);
                $this->faceDetections->update($detection);
            }
        }

        $this->settingsService->setSetting('clusterFaces.status', 'true');
        $this->settingsService->setSetting('clusterFaces.lastRun', (string)time());
    }

    /**
     * Pick the entries of $detections whose array key is listed in $detectionKeys.
     *
     * Keys and order of $detections are preserved; keys that do not exist in
     * $detections are ignored.
     *
     * @template T
     * @param array<int,T> $detections
     * @param int[] $detectionKeys
     * @return array<int,T>
     */
    private static function selectDetectionsByKeys(array $detections, array $detectionKeys): array {
        return array_intersect_key($detections, array_flip($detectionKeys));
    }

    /**
     * Determine which existing cluster won the vote and how large its overlap is.
     *
     * The overlap is the winner's number of votes divided by the size of the
     * reference sample that was drawn from that cluster. When there are no votes,
     * or the winner has no (non-empty) reference sample to normalise against
     * (e.g. the placeholder id -1), the result is "no existing cluster": [-1, 0.0].
     *
     * @param list<int> $votes Cluster ids, one per reference detection in the found cluster
     * @param array<int,int> $maxVotesByCluster Reference sample size per existing cluster id
     * @return array{0:int,1:float} [cluster id, overlap]
     */
    private static function electExistingCluster(array $votes, array $maxVotesByCluster): array {
        if ($votes === []) {
            return [-1, 0.0];
        }
        $voteCounts = array_count_values($votes);
        $maxVotes = max($voteCounts);
        $winner = array_search($maxVotes, $voteCounts, true);
        $referenceCount = $winner === false ? 0 : ($maxVotesByCluster[$winner] ?? 0);
        if ($referenceCount <= 0) {
            // Cannot normalise the vote; dividing here would raise a DivisionByZeroError.
            return [-1, 0.0];
        }
        return [(int)$winner, $maxVotes / (float)$referenceCount];
    }

    /**
     * Component-wise mean of the vectors of the given detections.
     *
     * Returns the null vector of length DIMENSIONS when $detections is empty.
     *
     * @param FaceDetection[] $detections
     * @return list<float>
     */
    public static function calculateCentroidOfDetections(array $detections): array {
        /** @var list<float> $sum */
        $sum = array_fill(0, self::DIMENSIONS, 0.0);
        $total = count($detections);
        if ($total === 0) {
            return $sum;
        }
        foreach ($detections as $detection) {
            $sum = array_map(static function (float $el, float $el2): float {
                return $el + $el2;
            }, $detection->getVector(), $sum);
        }
        return array_map(static function (float $el) use ($total): float {
            return $el / (float)$total;
        }, $sum);
    }

    /**
     * Group detections by file id and keep only files with more than one detection.
     *
     * @param array<FaceDetection> $detections
     * @return array<int,FaceDetection[]>
     */
    private function findFilesWithDuplicateFaces(array $detections): array {
        $files = [];
        foreach ($detections as $detection) {
            $files[$detection->getFileId()][] = $detection;
        }
        /** @var array<int,FaceDetection[]> $filesWithDuplicateFaces */
        $filesWithDuplicateFaces = array_filter($files, static function (array $detectionsOfFile): bool {
            return count($detectionsOfFile) > 1;
        });
        return $filesWithDuplicateFaces;
    }

    /**
     * Distance between two vectors as computed by the Euclidean kernel.
     *
     * @param list<int|float> $v1
     * @param list<int|float> $v2
     * @return float
     */
    private static function distance(array $v1, array $v2): float {
        self::$distance ??= new Euclidean();
        return self::$distance->compute($v1, $v2);
    }

    /**
     * Minimum cluster size for HDBSCAN: batchSize^(1/4.7), clamped to [2, 5] and rounded.
     *
     * Based on the hypothesis that photos per identity scale roughly with n^(1/5)
     * in the total number of photos.
     *
     * @param int $batchSize
     * @return int
     */
    private function getMinClusterSize(int $batchSize) : int {
        $scaled = $batchSize ** (1.0 / 4.7);
        return (int)round(max(2.0, min(5.0, $scaled)));
    }

    /**
     * Minimum sample size for HDBSCAN: batchSize^(1/5.6), clamped to [2, 4] and rounded.
     *
     * The root 5.6 (instead of 4.7) keeps this slightly smaller than
     * getMinClusterSize() while still scaling similarly.
     *
     * @param int $batchSize
     * @return int
     */
    private function getMinSampleSize(int $batchSize) : int {
        $scaled = $batchSize ** (1.0 / 5.6);
        return (int)round(max(2, min(4, $scaled)));
    }

    /**
     * Number of reference detections to sample from EACH existing cluster.
     *
     * Starts at 80 per cluster and decays exponentially towards 5 per cluster, so
     * the total sample grows to ~5000 detections for ~200-800 clusters and then
     * grows linearly with 5 detections per cluster.
     *
     * @param int $numberClusters
     * @return int
     */
    private function getReferenceSampleSize(int $numberClusters) : int {
        $decayingPart = 75.0 * 2.0 ** (-0.007 * $numberClusters);
        return (int)round($decayingPart + 5.0);
    }

    /**
     * Number of previously rejected detections to re-add to a batch: a quarter of
     * the batch for small batches, capped by the sub-linear curve 12 * batchSize^0.55
     * for large ones. The result is truncated to an integer.
     *
     * @param int $batchSize
     * @return int
     */
    private function getRejectSampleSize(int $batchSize): int {
        $quarterOfBatch = $batchSize / 4.0;
        $sublinearCap = 12.0 * $batchSize ** (0.55);
        return (int) min($quarterOfBatch, $sublinearCap);
    }
}