proofdb/scripts/reembed_chunks.php
2026-05-11 15:23:34 +08:00

136 lines
4.9 KiB
PHP

#!/usr/bin/env php
<?php
use app\service\Embedding\ChunkEmbeddingHandler;
use app\service\Embedding\ChunkEmbeddingRepository;
use app\service\Embedding\EmbeddingStatus;
require __DIR__ . '/../vendor/autoload.php';
require __DIR__ . '/../support/bootstrap.php';
/**
 * Parses the command-line options understood by this script.
 *
 * Recognised options:
 *   --archive_uid=<uid>  archive uid to work on; when repeated, the last one wins
 *   --reset              turns on the force-reset flag
 *
 * The archive uid is trimmed and a blank value becomes null, so
 * `--archive_uid=` and `--archive_uid="  "` mean "no filter" everywhere
 * instead of reaching later code as an empty string. Anything that is not
 * recognised is collected rather than dropped, so the caller can report it.
 *
 * @param list<string> $arguments Arguments without the script name.
 *
 * @return array{archiveUid: ?string, forceReset: bool, unknown: list<string>}
 */
function parseReembedArguments(array $arguments): array
{
    $archiveUid = null;
    $forceReset = false;
    $unknown = [];

    foreach ($arguments as $argument) {
        if (str_starts_with($argument, '--archive_uid=')) {
            $value = trim(substr($argument, strlen('--archive_uid=')));
            // Compare against '' explicitly: a uid of "0" is a valid value.
            $archiveUid = $value === '' ? null : $value;
            continue;
        }

        if ($argument === '--reset') {
            $forceReset = true;
            continue;
        }

        $unknown[] = $argument;
    }

    return [
        'archiveUid' => $archiveUid,
        'forceReset' => $forceReset,
        'unknown' => $unknown,
    ];
}

// $argv exists in the CLI global scope; the fallbacks keep this working when
// the file is included from inside a function, where $argv is not defined.
[
    'archiveUid' => $archiveUid,
    'forceReset' => $forceReset,
    'unknown' => $unknownArguments,
] = parseReembedArguments(array_slice($argv ?? $_SERVER['argv'] ?? [], 1));

// A mistyped option (e.g. --archive-uid=...) would otherwise silently widen
// the run to every chunk, so make it visible. The run still proceeds, which
// keeps existing invocations working.
foreach ($unknownArguments as $unknownArgument) {
    fwrite(STDERR, 'Warning: ignoring unrecognised argument: ' . $unknownArgument . PHP_EOL);
}
// Main run: count eligible chunks, reset chunk statuses according to the
// selected mode, drain the pending-archive queue through the embedding
// handler while drawing a progress bar, then print a summary.
$repository = new ChunkEmbeddingRepository();
$handler = new ChunkEmbeddingHandler();
// Only reported in the output below; values below 1 are clamped to 1.
$batchSize = max(1, (int) config('LLMapi.embedding.batch_size', 32));

// Work out the archive filter once. The check is explicit (not truthiness)
// so that an archive uid of "0" is still treated and reported as a filter.
$filterUid = ($archiveUid !== null && trim($archiveUid) !== '') ? trim($archiveUid) : null;
$filterLabel = $filterUid ?? '(all chunks)';

try {
    $totalChunks = $repository->countChunks($archiveUid);
    if ($totalChunks === 0) {
        echo 'Chunk re-embedding completed.' . PHP_EOL;
        echo 'Archive filter: ' . $filterLabel . PHP_EOL;
        echo 'Mode: nothing-to-do' . PHP_EOL;
        echo 'Eligible chunks: 0' . PHP_EOL;
        exit(0);
    }

    $mode = $forceReset ? 'reset' : 'resume';
    $resetCount = $mode === 'reset'
        ? $repository->resetAllChunksToPending($archiveUid)
        : $repository->resetRecoverableChunksToPending($archiveUid);

    $batchCount = 0;
    $processedArchives = [];
    $progress = completedCount($repository, $archiveUid);

    echo 'Progress granularity: embedding request batches (up to ' . $batchSize . ' chunks each)' . PHP_EOL;
    renderProgress($progress, $totalChunks, 'Re-embedding');

    // NOTE(review): this loop ends only when the queue (after filtering)
    // comes back empty. If handle() keeps returning <= 0 for archives the
    // queue keeps returning, it would not terminate - confirm against
    // ChunkEmbeddingRepository::queuePendingArchiveTasks().
    while (true) {
        $archiveUids = $repository->queuePendingArchiveTasks(100);

        // NOTE(review): the filter is applied after fetching up to 100 uids,
        // so a page that does not contain the requested archive ends the
        // run - confirm that is acceptable.
        if ($filterUid !== null) {
            $archiveUids = array_values(array_filter(
                $archiveUids,
                static fn (string $uid): bool => $uid === $filterUid,
            ));
        }

        if ($archiveUids === []) {
            break;
        }

        foreach ($archiveUids as $uid) {
            $processedChunkCount = $handler->handle([
                'task_type' => 'embedding',
                'target_type' => 'archive',
                'target_uid' => $uid,
                'attempt' => 1,
            ]);

            // Calls that processed nothing are not counted as a batch.
            if ($processedChunkCount <= 0) {
                continue;
            }

            $batchCount++;
            $processedArchives[] = $uid;
            $progress = completedCount($repository, $archiveUid);
            renderProgress($progress, $totalChunks, 'Re-embedding');
            // Leading newline moves off the in-place progress line first.
            fwrite(STDOUT, PHP_EOL . sprintf(
                'Batch #%d archive=%s chunks=%d progress=%d/%d%s',
                $batchCount,
                $uid,
                $processedChunkCount,
                $progress,
                $totalChunks,
                PHP_EOL
            ));
        }
    }

    $embeddedChunks = $repository->countChunksByStatuses([EmbeddingStatus::EMBEDDED], $archiveUid);
    $terminalFailures = $repository->countChunksByStatuses([EmbeddingStatus::FAILED_TERMINAL], $archiveUid);
    renderProgress($embeddedChunks + $terminalFailures, $totalChunks, 'Re-embedding', true);

    echo 'Chunk re-embedding completed.' . PHP_EOL;
    echo 'Archive filter: ' . $filterLabel . PHP_EOL;
    echo 'Mode: ' . $mode . ($forceReset ? ' (--reset)' : '') . PHP_EOL;
    echo 'Eligible chunks: ' . $totalChunks . PHP_EOL;
    echo 'Embedding batch size: ' . $batchSize . PHP_EOL;
    echo 'Reset chunks: ' . $resetCount . PHP_EOL;
    echo 'Processed archives: ' . count(array_unique($processedArchives)) . PHP_EOL;
    echo 'Processed batches: ' . $batchCount . PHP_EOL;
    echo 'Embedded chunk rows now marked embedded: ' . $embeddedChunks . PHP_EOL;
    echo 'Terminal failures: ' . $terminalFailures . PHP_EOL;
    if ($processedArchives !== []) {
        echo 'Archives: ' . implode(', ', $processedArchives) . PHP_EOL;
    }
    echo 'Next step: refresh OpenSearch vectors with `php scripts/reindex_opensearch.php'
        . ($filterUid !== null ? ' --archive_uid=' . $filterUid : '')
        . ($forceReset ? ' --reset' : '')
        . '`' . PHP_EOL;
} catch (Throwable $exception) {
    // Report the exception class and message on STDERR and exit non-zero.
    fwrite(STDERR, PHP_EOL . $exception::class . ': ' . $exception->getMessage() . PHP_EOL);
    exit(1);
}
/**
 * Counts the chunks whose status is either EMBEDDED or FAILED_TERMINAL.
 *
 * @param ChunkEmbeddingRepository $repository Repository that performs the count.
 * @param string|null              $archiveUid Passed through unchanged to the repository.
 *
 * @return int Whatever countChunksByStatuses() reports for those two statuses.
 */
function completedCount(ChunkEmbeddingRepository $repository, ?string $archiveUid): int
{
    $finishedStatuses = [EmbeddingStatus::EMBEDDED, EmbeddingStatus::FAILED_TERMINAL];

    return $repository->countChunksByStatuses($finishedStatuses, $archiveUid);
}
/**
 * Writes a one-line text progress bar to STDOUT.
 *
 * The line starts with a carriage return, so on a terminal each call
 * overwrites the previous bar. A newline is appended once the bar is full
 * or when $final is set.
 *
 * @param int    $done  Completed items; clamped into the range 0..$total.
 * @param int    $total Total items; values below 1 are treated as 1.
 * @param string $label Text printed in front of the bar.
 * @param bool   $final Always end the line with a newline when true.
 */
function renderProgress(int $done, int $total, string $label, bool $final = false): void
{
    $barWidth = 32;

    // Guard the division and keep the numerator inside the bar's range.
    $denominator = $total < 1 ? 1 : $total;
    if ($done < 0) {
        $numerator = 0;
    } elseif ($done > $denominator) {
        $numerator = $denominator;
    } else {
        $numerator = $done;
    }

    $ticks = (int) floor(($numerator / $denominator) * $barWidth);
    $gauge = str_pad(str_repeat('=', $ticks), $barWidth, ' ');
    $percentage = str_pad(number_format(($numerator / $denominator) * 100, 1), 5, ' ', STR_PAD_LEFT);

    fwrite(STDOUT, sprintf("\r%s [%s] %s%% (%d/%d)", $label, $gauge, $percentage, $numerator, $denominator));

    $finished = $numerator >= $denominator;
    if ($final || $finished) {
        fwrite(STDOUT, PHP_EOL);
    }
}