proofdb/scripts/reindex_opensearch.php
2026-05-11 15:23:34 +08:00

129 lines
4.6 KiB
PHP

#!/usr/bin/env php
<?php
use app\service\Search\ChunkSearchIndexHandler;
use app\service\Search\ChunkSearchIndexRepository;
use app\service\Search\OpenSearchClientFactory;
use app\service\Search\OpenSearchChunkIndex;
require __DIR__ . '/../vendor/autoload.php';
require __DIR__ . '/../support/bootstrap.php';
// ---------------------------------------------------------------------------
// Command-line options
//   --archive_uid=<uid>  restrict the run to one archive (default: no filter)
//   --reset              request "reset" mode even if the index already exists
// ---------------------------------------------------------------------------
$archiveUid = null;
$forceReset = false;

foreach (array_slice($argv, 1) as $argument) {
    if (str_starts_with($argument, '--archive_uid=')) {
        $archiveUid = substr($argument, strlen('--archive_uid='));

        // An empty value usually means an unset shell variable. Passing '' on
        // would make the filter ambiguous (the summary would label the run as
        // unfiltered), so refuse it instead of guessing what was meant.
        if ($archiveUid === '') {
            fwrite(STDERR, 'Option --archive_uid requires a non-empty value.' . PHP_EOL);
            exit(1);
        }

        continue;
    }

    if ($argument === '--reset') {
        $forceReset = true;
        continue;
    }

    // Unknown arguments are still ignored (as before), but say so: a typo such
    // as "--archive-uid=..." would otherwise silently run without a filter.
    fwrite(STDERR, 'Warning: ignoring unrecognized argument: ' . $argument . PHP_EOL);
}
// Collaborators used by the reindex run below.
$repository = new ChunkSearchIndexRepository();
$handler = new ChunkSearchIndexHandler();
$index = new OpenSearchChunkIndex();
$clientFactory = new OpenSearchClientFactory();

// Configured bulk size (default 500), never allowed to drop below 1.
// In the code visible here it is only echoed to the operator.
$bulkSize = (int) config('opensearch.bulk.chunk_size', 500);
if ($bulkSize < 1) {
    $bulkSize = 1;
}
// Main reindex run. Any Throwable is reported on STDERR and ends the script
// with exit code 1.
try {
    $client = $clientFactory->make();
    $indexName = config('opensearch.indices.chunks', 'proofdb_chunks');

    // Record whether the index existed *before* ensureExists() is called; this
    // decides between "reset" and "resume" mode further down.
    $indexExists = (bool) $client->indices()->exists(['index' => $indexName]);
    $index->ensureExists();

    $totalChunks = $repository->countEmbeddedChunks($archiveUid);
    if ($totalChunks === 0) {
        echo 'OpenSearch reindex completed.' . PHP_EOL;
        echo 'Index: ' . $indexName . PHP_EOL;
        echo 'Archive filter: ' . ($archiveUid ?: '(all embedded archives)') . PHP_EOL;
        echo 'Mode: nothing-to-do' . PHP_EOL;
        echo 'Eligible embedded chunks: 0' . PHP_EOL;
        exit(0);
    }

    // "reset" when explicitly requested or when the index had to be created;
    // otherwise "resume". Each mode calls a different repository reset method.
    $mode = $forceReset || !$indexExists ? 'reset' : 'resume';
    $resetCount = $mode === 'reset'
        ? $repository->resetEmbeddedChunksToPending($archiveUid)
        : $repository->resetRecoverableChunksToPending($archiveUid);

    $batchCount = 0;
    $indexedArchives = [];
    $progress = $repository->countIndexedChunks($archiveUid);

    echo 'Progress granularity: OpenSearch bulk batches (up to ' . $bulkSize . ' chunks each)' . PHP_EOL;
    renderProgress($progress, $totalChunks, 'Reindexing');

    // Keep pulling up to 100 archive UIDs at a time until the repository
    // returns an empty list.
    // NOTE(review): termination relies on queuePendingArchiveTasks() eventually
    // returning [] — confirm it cannot keep returning archives for which
    // handle() reports 0 processed chunks.
    while (true) {
        $archiveUids = $repository->queuePendingArchiveTasks(100);
        if ($archiveUids === []) {
            break;
        }

        foreach ($archiveUids as $uid) {
            $processedChunkCount = $handler->handle([
                'task_type' => 'search_index',
                'target_type' => 'archive',
                'target_uid' => $uid,
                'attempt' => 1,
            ]);

            // Calls that processed nothing are not counted as a batch.
            if ($processedChunkCount <= 0) {
                continue;
            }

            $batchCount++;
            $indexedArchives[] = $uid;

            $progress = $repository->countIndexedChunks($archiveUid);
            renderProgress($progress, $totalChunks, 'Reindexing');
            fwrite(STDOUT, PHP_EOL . sprintf(
                'Batch #%d archive=%s chunks=%d progress=%d/%d%s',
                $batchCount,
                $uid,
                $processedChunkCount,
                $progress,
                $totalChunks,
                PHP_EOL
            ));
        }
    }

    $indexedChunks = $repository->countIndexedChunks($archiveUid);
    renderProgress($indexedChunks, $totalChunks, 'Reindexing', true);

    // The same archive UID is appended once per processed batch, so collapse
    // repeats once and use that list for both the count and the listing.
    // Previously the count was unique but the listing repeated UIDs.
    $uniqueArchives = array_values(array_unique($indexedArchives));

    echo 'OpenSearch reindex completed.' . PHP_EOL;
    echo 'Index: ' . $indexName . PHP_EOL;
    echo 'Archive filter: ' . ($archiveUid ?: '(all embedded archives)') . PHP_EOL;
    echo 'Mode: ' . $mode . ($forceReset ? ' (--reset)' : (!$indexExists ? ' (index was missing)' : '')) . PHP_EOL;
    echo 'Eligible embedded chunks: ' . $totalChunks . PHP_EOL;
    echo 'OpenSearch bulk size: ' . $bulkSize . PHP_EOL;
    echo 'Reset chunks: ' . $resetCount . PHP_EOL;
    echo 'Indexed archives: ' . count($uniqueArchives) . PHP_EOL;
    echo 'Processed batches: ' . $batchCount . PHP_EOL;
    echo 'Indexed chunk rows now marked indexed: ' . $indexedChunks . PHP_EOL;
    if ($uniqueArchives !== []) {
        echo 'Archives: ' . implode(', ', $uniqueArchives) . PHP_EOL;
    }
} catch (Throwable $exception) {
    fwrite(STDERR, $exception::class . ': ' . $exception->getMessage() . PHP_EOL);
    exit(1);
}
/**
 * Write a single-line text progress bar to STDOUT.
 *
 * The line begins with a carriage return so that successive calls overwrite
 * each other on a terminal. A trailing newline is written once the bar is
 * full, or whenever $final is true.
 *
 * @param int    $done  Units completed; clamped into the range 0..$total.
 * @param int    $total Units expected; values below 1 are treated as 1 so the
 *                      ratio is always defined.
 * @param string $label Text printed in front of the bar.
 * @param bool   $final Force the trailing newline even if the bar is not full.
 */
function renderProgress(int $done, int $total, string $label, bool $final = false): void
{
    $total = max(1, $total);
    $done = max(0, min($done, $total));
    $complete = $done >= $total;

    $width = 32;
    $ratio = $done / $total;
    $filled = (int) floor($ratio * $width);
    $bar = str_repeat('=', $filled) . str_repeat(' ', max(0, $width - $filled));

    // number_format() rounds, so e.g. 2499/2500 (99.96%) used to be printed as
    // "100.0%" next to an unfinished bar. Cap the displayed value at 99.9
    // until the work is actually complete.
    $percentValue = $ratio * 100;
    if (!$complete) {
        $percentValue = min($percentValue, 99.9);
    }
    $percent = str_pad(number_format($percentValue, 1), 5, ' ', STR_PAD_LEFT);

    $line = sprintf("\r%s [%s] %s%% (%d/%d)", $label, $bar, $percent, $done, $total);
    fwrite(STDOUT, $line);

    if ($final || $complete) {
        fwrite(STDOUT, PHP_EOL);
    }
}