#!/usr/bin/env php
<?php
|
|
|
|
use app\service\Embedding\ChunkEmbeddingHandler;
|
|
use app\service\Embedding\ChunkEmbeddingRepository;
|
|
use app\service\Embedding\EmbeddingStatus;
|
|
|
|
require __DIR__ . '/../vendor/autoload.php';
|
|
require __DIR__ . '/../support/bootstrap.php';
|
|
|
|
/**
 * Parse the command-line arguments understood by this script.
 *
 * Recognised arguments:
 *   --archive_uid=<uid>  archive uid used as the filter for this run;
 *   --reset              selects the "reset" mode instead of the default "resume" mode.
 *
 * The archive uid is trimmed and an empty value is turned into null, so every
 * consumer further down sees one canonical filter value. When an argument is
 * repeated, the last occurrence wins.
 *
 * @param list<string> $arguments Command-line arguments without the script name.
 *
 * @return array{archiveUid: ?string, forceReset: bool, unknown: list<string>}
 */
function parseReembedArguments(array $arguments): array
{
    $archiveUidPrefix = '--archive_uid=';
    $archiveUid = null;
    $forceReset = false;
    $unknown = [];

    foreach ($arguments as $argument) {
        if (str_starts_with($argument, $archiveUidPrefix)) {
            $value = trim(substr($argument, strlen($archiveUidPrefix)));
            // Strict comparison on purpose: a uid of "0" is a valid filter.
            $archiveUid = $value === '' ? null : $value;
            continue;
        }

        if ($argument === '--reset') {
            $forceReset = true;
            continue;
        }

        $unknown[] = $argument;
    }

    return [
        'archiveUid' => $archiveUid,
        'forceReset' => $forceReset,
        'unknown' => $unknown,
    ];
}

// $argv only exists in the global scope; fall back to $_SERVER['argv'] otherwise.
[
    'archiveUid' => $archiveUid,
    'forceReset' => $forceReset,
    'unknown' => $unknownArguments,
] = parseReembedArguments(array_slice($argv ?? $_SERVER['argv'] ?? [], 1));

// A mistyped flag (e.g. --archive-uid=...) would otherwise silently widen the
// run to every chunk, so make ignored arguments visible without aborting.
foreach ($unknownArguments as $unknownArgument) {
    fwrite(STDERR, 'Warning: ignoring unrecognised argument "' . $unknownArgument . '"' . PHP_EOL);
}
|
|
|
|
$repository = new ChunkEmbeddingRepository();
$handler = new ChunkEmbeddingHandler();
// Never run with a batch size below 1, whatever the configuration contains.
$batchSize = max(1, (int) config('LLMapi.embedding.batch_size', 32));

// Canonical archive filter: trimmed uid, or null when no usable uid was given.
// The same value is used for counting, resetting, queue filtering and reporting,
// and it is compared with null explicitly so that a uid of "0" stays a filter.
$archiveFilter = null;
if ($archiveUid !== null && trim($archiveUid) !== '') {
    $archiveFilter = trim($archiveUid);
}

// Give up after this many consecutive passes in which nothing changed.
$maxStalledPasses = 3;

try {
    $totalChunks = $repository->countChunks($archiveFilter);
    if ($totalChunks === 0) {
        echo 'Chunk re-embedding completed.' . PHP_EOL;
        echo 'Archive filter: ' . ($archiveFilter ?? '(all chunks)') . PHP_EOL;
        echo 'Mode: nothing-to-do' . PHP_EOL;
        echo 'Eligible chunks: 0' . PHP_EOL;
        exit(0);
    }

    // --reset selects resetAllChunksToPending(); the default "resume" mode
    // selects resetRecoverableChunksToPending().
    $mode = $forceReset ? 'reset' : 'resume';
    $resetCount = $mode === 'reset'
        ? $repository->resetAllChunksToPending($archiveFilter)
        : $repository->resetRecoverableChunksToPending($archiveFilter);

    $batchCount = 0;
    $processedArchives = [];
    $stalledPasses = 0;
    $progress = completedCount($repository, $archiveFilter);
    echo 'Progress granularity: embedding request batches (up to ' . $batchSize . ' chunks each)' . PHP_EOL;
    renderProgress($progress, $totalChunks, 'Re-embedding');

    while (true) {
        // NOTE(review): the queue is asked for at most 100 archives and the
        // filter is applied afterwards; if the requested archive is not among
        // them the loop ends without processing it — confirm against the
        // repository implementation.
        $archiveUids = $repository->queuePendingArchiveTasks(100);
        if ($archiveFilter !== null) {
            $archiveUids = array_values(array_filter(
                $archiveUids,
                static fn (string $uid): bool => $uid === $archiveFilter,
            ));
        }

        if ($archiveUids === []) {
            break;
        }

        $passProcessedChunks = 0;

        foreach ($archiveUids as $uid) {
            $processedChunkCount = $handler->handle([
                'task_type' => 'embedding',
                'target_type' => 'archive',
                'target_uid' => $uid,
                'attempt' => 1,
            ]);
            if ($processedChunkCount <= 0) {
                continue;
            }

            $passProcessedChunks += $processedChunkCount;
            $batchCount++;
            $processedArchives[] = $uid;
            $progress = completedCount($repository, $archiveFilter);
            renderProgress($progress, $totalChunks, 'Re-embedding');
            fwrite(STDOUT, PHP_EOL . sprintf(
                'Batch #%d archive=%s chunks=%d progress=%d/%d%s',
                $batchCount,
                $uid,
                $processedChunkCount,
                $progress,
                $totalChunks,
                PHP_EOL
            ));
        }

        if ($passProcessedChunks > 0) {
            $stalledPasses = 0;
            continue;
        }

        // Nothing was processed in this pass. If the completed count moved
        // anyway, the state is still changing and another pass is worthwhile.
        $completedNow = completedCount($repository, $archiveFilter);
        if ($completedNow !== $progress) {
            $progress = $completedNow;
            $stalledPasses = 0;
            continue;
        }

        // The queue keeps returning archives but nothing changes: stop instead
        // of spinning forever on the same archives.
        $stalledPasses++;
        if ($stalledPasses >= $maxStalledPasses) {
            fwrite(STDERR, PHP_EOL . sprintf(
                'Warning: stopping after %d consecutive passes without progress; some archives are still queued.%s',
                $stalledPasses,
                PHP_EOL
            ));
            break;
        }
    }

    $embeddedChunks = $repository->countChunksByStatuses([EmbeddingStatus::EMBEDDED], $archiveFilter);
    $terminalFailures = $repository->countChunksByStatuses([EmbeddingStatus::FAILED_TERMINAL], $archiveFilter);
    renderProgress($embeddedChunks + $terminalFailures, $totalChunks, 'Re-embedding', true);

    // $processedArchives holds one entry per batch; report each archive once.
    $uniqueArchives = array_values(array_unique($processedArchives));

    echo 'Chunk re-embedding completed.' . PHP_EOL;
    echo 'Archive filter: ' . ($archiveFilter ?? '(all chunks)') . PHP_EOL;
    echo 'Mode: ' . $mode . ($forceReset ? ' (--reset)' : '') . PHP_EOL;
    echo 'Eligible chunks: ' . $totalChunks . PHP_EOL;
    echo 'Embedding batch size: ' . $batchSize . PHP_EOL;
    echo 'Reset chunks: ' . $resetCount . PHP_EOL;
    echo 'Processed archives: ' . count($uniqueArchives) . PHP_EOL;
    echo 'Processed batches: ' . $batchCount . PHP_EOL;
    echo 'Embedded chunk rows now marked embedded: ' . $embeddedChunks . PHP_EOL;
    echo 'Terminal failures: ' . $terminalFailures . PHP_EOL;
    if ($uniqueArchives !== []) {
        echo 'Archives: ' . implode(', ', $uniqueArchives) . PHP_EOL;
    }
    echo 'Next step: refresh OpenSearch vectors with `php scripts/reindex_opensearch.php'
        . ($archiveFilter !== null ? ' --archive_uid=' . $archiveFilter : '')
        . ($forceReset ? ' --reset' : '')
        . '`' . PHP_EOL;
} catch (Throwable $exception) {
    // Any failure is reported on STDERR as "<class>: <message>" with exit status 1.
    fwrite(STDERR, PHP_EOL . $exception::class . ': ' . $exception->getMessage() . PHP_EOL);
    exit(1);
}
|
|
|
|
/**
 * Count the chunks whose status is EMBEDDED or FAILED_TERMINAL.
 *
 * @param ChunkEmbeddingRepository $repository Repository that performs the count.
 * @param ?string                  $archiveUid Archive uid handed to the repository unchanged; may be null.
 *
 * @return int Value returned by the repository for those two statuses.
 */
function completedCount(ChunkEmbeddingRepository $repository, ?string $archiveUid): int
{
    $finishedStatuses = [EmbeddingStatus::EMBEDDED, EmbeddingStatus::FAILED_TERMINAL];

    return $repository->countChunksByStatuses($finishedStatuses, $archiveUid);
}
|
|
|
|
/**
 * Draw a single-line progress bar on STDOUT.
 *
 * The line starts with a carriage return so successive calls overwrite each
 * other. A line break is appended once the bar is complete or when $final is set.
 *
 * @param int    $done  Number of finished items; clamped to the range 0..$total.
 * @param int    $total Number of items overall; values below 1 are treated as 1.
 * @param string $label Text printed in front of the bar.
 * @param bool   $final Force the trailing line break even if the bar is not full.
 */
function renderProgress(int $done, int $total, string $label, bool $final = false): void
{
    $barWidth = 32;

    // A total of at least 1 avoids division by zero; the counter stays inside [0, total].
    $safeTotal = max(1, $total);
    $shown = min(max($done, 0), $safeTotal);
    $ratio = $shown / $safeTotal;

    $filledCells = (int) floor($ratio * $barWidth);
    $bar = str_pad(str_repeat('=', $filledCells), $barWidth, ' ');
    // Right-align the percentage in five columns, e.g. "  7.5" or "100.0".
    $percent = sprintf('%5s', number_format($ratio * 100, 1));

    $output = "\r" . $label . ' [' . $bar . '] ' . $percent . '% (' . $shown . '/' . $safeTotal . ')';
    if ($final || $shown >= $safeTotal) {
        $output .= PHP_EOL;
    }

    fwrite(STDOUT, $output);
}
|