#!/usr/bin/env php countChunks($archiveUid); if ($totalChunks === 0) { echo 'Chunk re-embedding completed.' . PHP_EOL; echo 'Archive filter: ' . ($archiveUid ?: '(all chunks)') . PHP_EOL; echo 'Mode: nothing-to-do' . PHP_EOL; echo 'Eligible chunks: 0' . PHP_EOL; exit(0); } $mode = $forceReset ? 'reset' : 'resume'; $resetCount = $mode === 'reset' ? $repository->resetAllChunksToPending($archiveUid) : $repository->resetRecoverableChunksToPending($archiveUid); $batchCount = 0; $processedArchives = []; $progress = completedCount($repository, $archiveUid); echo 'Progress granularity: embedding request batches (up to ' . $batchSize . ' chunks each)' . PHP_EOL; renderProgress($progress, $totalChunks, 'Re-embedding'); while (true) { $archiveUids = $repository->queuePendingArchiveTasks(100); if ($archiveUid !== null && trim($archiveUid) !== '') { $archiveUids = array_values(array_filter($archiveUids, static fn (string $uid): bool => $uid === trim($archiveUid))); } if ($archiveUids === []) { break; } foreach ($archiveUids as $uid) { $processedChunkCount = $handler->handle([ 'task_type' => 'embedding', 'target_type' => 'archive', 'target_uid' => $uid, 'attempt' => 1, ]); if ($processedChunkCount <= 0) { continue; } $batchCount++; $processedArchives[] = $uid; $progress = completedCount($repository, $archiveUid); renderProgress($progress, $totalChunks, 'Re-embedding'); fwrite(STDOUT, PHP_EOL . sprintf( 'Batch #%d archive=%s chunks=%d progress=%d/%d%s', $batchCount, $uid, $processedChunkCount, $progress, $totalChunks, PHP_EOL )); } } $embeddedChunks = $repository->countChunksByStatuses([EmbeddingStatus::EMBEDDED], $archiveUid); $terminalFailures = $repository->countChunksByStatuses([EmbeddingStatus::FAILED_TERMINAL], $archiveUid); renderProgress($embeddedChunks + $terminalFailures, $totalChunks, 'Re-embedding', true); echo 'Chunk re-embedding completed.' . PHP_EOL; echo 'Archive filter: ' . ($archiveUid ?: '(all chunks)') . PHP_EOL; echo 'Mode: ' . $mode . ($forceReset ? 
' (--reset)' : '') . PHP_EOL;

    // A UID is appended to $processedArchives after every successful batch, so
    // the raw list may repeat an archive. De-duplicate once (keeping first-seen
    // order) so the count and the listing below always agree.
    $uniqueArchives = array_values(array_unique($processedArchives));

    echo 'Eligible chunks: ' . $totalChunks . PHP_EOL;
    echo 'Embedding batch size: ' . $batchSize . PHP_EOL;
    echo 'Reset chunks: ' . $resetCount . PHP_EOL;
    echo 'Processed archives: ' . count($uniqueArchives) . PHP_EOL;
    echo 'Processed batches: ' . $batchCount . PHP_EOL;
    echo 'Embedded chunk rows now marked embedded: ' . $embeddedChunks . PHP_EOL;
    echo 'Terminal failures: ' . $terminalFailures . PHP_EOL;

    if ($uniqueArchives !== []) {
        echo 'Archives: ' . implode(', ', $uniqueArchives) . PHP_EOL;
    }

    // Hint for the follow-up command, repeating the flags this run was given.
    echo 'Next step: refresh OpenSearch vectors with `php scripts/reindex_opensearch.php' . ($archiveUid ? ' --archive_uid=' . $archiveUid : '') . ($forceReset ? ' --reset' : '') . '`' . PHP_EOL;
} catch (Throwable $exception) {
    // Leading PHP_EOL moves off any in-place progress line (written with "\r")
    // before the error is reported on STDERR.
    fwrite(STDERR, PHP_EOL . $exception::class . ': ' . $exception->getMessage() . PHP_EOL);
    exit(1);
}

/**
 * Count the chunks that the progress display treats as completed.
 *
 * Asks the repository for the number of chunks whose status is either
 * EMBEDDED or FAILED_TERMINAL.
 *
 * @param ChunkEmbeddingRepository $repository Repository that performs the count.
 * @param ?string                  $archiveUid Forwarded unchanged as the repository's
 *                                             second argument; presumably null means
 *                                             "all archives" (confirm in the repository).
 *
 * @return int Value returned by countChunksByStatuses().
 */
function completedCount(ChunkEmbeddingRepository $repository, ?string $archiveUid): int
{
    return $repository->countChunksByStatuses(
        [
            EmbeddingStatus::EMBEDDED,
            EmbeddingStatus::FAILED_TERMINAL,
        ],
        $archiveUid
    );
}

/**
 * Write a single-line text progress bar to STDOUT.
 *
 * The line starts with a carriage return so that repeated calls overwrite the
 * previous bar. A trailing newline is written when $final is true or when
 * $done has reached $total.
 *
 * @param int    $done  Number of completed items; clamped to the range 0..$total.
 * @param int    $total Total number of items; values below 1 are treated as 1.
 * @param string $label Text printed in front of the bar.
 * @param bool   $final Force the trailing newline even if $done is below $total.
 */
function renderProgress(int $done, int $total, string $label, bool $final = false): void
{
    $barWidth = 32;

    // Clamp first: a total below 1 would divide by zero, and an out-of-range
    // $done would produce a negative or overlong bar.
    $total = max(1, $total);
    $done = max(0, min($done, $total));

    $ratio = $done / $total;
    $filled = (int) floor($ratio * $barWidth);
    $bar = str_repeat('=', $filled) . str_repeat(' ', max(0, $barWidth - $filled));
    $percent = str_pad(number_format($ratio * 100, 1), 5, ' ', STR_PAD_LEFT);

    fwrite(STDOUT, sprintf("\r%s [%s] %s%% (%d/%d)", $label, $bar, $percent, $done, $total));

    if ($final || $done >= $total) {
        fwrite(STDOUT, PHP_EOL);
    }
}