proofdb/scripts/backfill_archive_content.php
2026-05-08 00:05:51 +08:00

118 lines
3.1 KiB
PHP

#!/usr/bin/env php
<?php
use app\service\ArticleImportService;
use support\Db;
require __DIR__ . '/../vendor/autoload.php';
require __DIR__ . '/../support/bootstrap.php';
require __DIR__ . '/../vendor/webman/database/src/support/Db.php';
/**
 * Parse the command-line options accepted by this backfill script.
 *
 * Recognised options:
 *   --archive_uid=<uid>  restrict the run to a single archive (value is trimmed;
 *                        an empty value means "no filter"); the last occurrence wins
 *   --force              also process archives that already have content
 *   --dry-run            report what would be written without updating anything
 *
 * Positional (non-dash) arguments are ignored. Any other argument starting with
 * "-" is rejected, so that a typo such as "--dryrun" or a space-separated
 * "--archive_uid foo" cannot silently turn into a real, unfiltered write run.
 *
 * @param list<string> $arguments Arguments without the script name.
 *
 * @return array{archiveUid: ?string, force: bool, dryRun: bool}
 *
 * @throws InvalidArgumentException When an unrecognised option is supplied.
 */
function parseBackfillArguments(array $arguments): array
{
    $archiveUidPrefix = '--archive_uid=';
    $archiveUid = null;
    $force = false;
    $dryRun = false;

    foreach ($arguments as $argument) {
        $argument = (string) $argument;

        if (str_starts_with($argument, $archiveUidPrefix)) {
            $value = trim(substr($argument, strlen($archiveUidPrefix)));
            // Normalise once: blank values are treated as "no filter".
            $archiveUid = $value === '' ? null : $value;
            continue;
        }

        if ($argument === '--force') {
            $force = true;
            continue;
        }

        if ($argument === '--dry-run') {
            $dryRun = true;
            continue;
        }

        if (str_starts_with($argument, '-')) {
            throw new InvalidArgumentException(
                "Unknown option: {$argument} (expected --archive_uid=<uid>, --force, --dry-run)"
            );
        }
    }

    return [
        'archiveUid' => $archiveUid,
        'force' => $force,
        'dryRun' => $dryRun,
    ];
}

try {
    [
        'archiveUid' => $archiveUid,
        'force' => $force,
        'dryRun' => $dryRun,
    ] = parseBackfillArguments(array_slice($_SERVER['argv'] ?? [], 1));
} catch (InvalidArgumentException $exception) {
    // Abort before any database work happens when the invocation is malformed.
    fwrite(STDERR, $exception->getMessage() . PHP_EOL);
    exit(2);
}
// Select the archives to process, in ascending id order.
$archiveFilter = $archiveUid === null ? '' : trim($archiveUid);

$query = Db::table('archives')->orderBy('id');

if ($archiveFilter !== '') {
    // Restrict the run to the single archive requested on the command line.
    $query->where('archive_uid', $archiveFilter);
}

if (!$force) {
    // Without --force only archives whose content is NULL or '' are candidates.
    $query->where(static function ($builder): void {
        $builder->whereNull('content')->orWhere('content', '');
    });
}

// Only archive_uid and raw are read from each row; fetching title/content as
// well would load every archive's full text into memory for nothing
// (particularly costly with --force, where content is populated).
$archives = $query->get(['archive_uid', 'raw'])->all();
// Rebuild archives.content for every selected archive.
// Preferred source is the archive's own raw column passed through
// ArticleImportService::normalizeArchiveContentString(); when that yields
// nothing, the archive's chunks (ordered by chunk_index) are joined instead.
$normalizer = new ArticleImportService();
$scanned = 0;
$updated = 0;
$fromRaw = 0;
$fromChunks = 0;
$skipped = 0;

foreach ($archives as $archive) {
    $scanned++;
    $archiveUidValue = (string) $archive->archive_uid;
    $raw = is_string($archive->raw ?? null) ? $archive->raw : null;
    $content = null;
    $source = 'none';

    if ($raw !== null && trim($raw) !== '') {
        $content = $normalizer->normalizeArchiveContentString($raw);
        $source = 'raw';
    }

    // Fall back to chunks both when raw is missing/blank and when raw was
    // present but normalised to nothing; previously the latter case was
    // skipped as "no usable raw/chunks" without the chunks ever being checked.
    if ($content === null || $content === '') {
        $content = null;
        $source = 'none';

        $chunkTexts = Db::table('chunks')
            ->where('archive_uid', $archiveUidValue)
            ->orderBy('chunk_index')
            ->pluck('text')
            ->all();

        // Trim every chunk and drop the ones that end up empty.
        $chunkTexts = array_values(array_filter(
            array_map(static fn ($value): string => trim((string) $value), $chunkTexts),
            static fn (string $value): bool => $value !== ''
        ));

        if ($chunkTexts !== []) {
            // Each piece is already trimmed and non-empty, so the joined text
            // needs no further trimming.
            $content = implode("\n\n", $chunkTexts);
            $source = 'chunks';
        }
    }

    if ($content === null || $content === '') {
        $skipped++;
        echo "[skip] {$archiveUidValue} no usable raw/chunks" . PHP_EOL;
        continue;
    }

    // Source counters are tallied in dry-run mode as well as in real runs.
    if ($source === 'raw') {
        $fromRaw++;
    } else {
        $fromChunks++;
    }

    $contentLength = mb_strlen($content);

    if ($dryRun) {
        echo "[dry-run] {$archiveUidValue} source={$source} content_length=" . $contentLength . PHP_EOL;
        continue;
    }

    Db::table('archives')
        ->where('archive_uid', $archiveUidValue)
        ->update(['content' => $content]);
    $updated++;

    echo "[updated] {$archiveUidValue} source={$source} content_length=" . $contentLength . PHP_EOL;
}
// Final report. The archive filter label uses the same condition as the query
// filter (non-null and non-blank after trimming) rather than plain truthiness,
// so a uid of '0' is shown as '0' and a whitespace-only value is shown as 'auto'.
$archiveFilterLabel = ($archiveUid !== null && trim($archiveUid) !== '')
    ? trim($archiveUid)
    : 'auto';

echo 'Archive content backfill completed.' . PHP_EOL;
echo 'Archive filter: ' . $archiveFilterLabel . PHP_EOL;
echo 'Force mode: ' . ($force ? 'yes' : 'no') . PHP_EOL;
echo 'Dry run: ' . ($dryRun ? 'yes' : 'no') . PHP_EOL;
echo 'Scanned: ' . $scanned . PHP_EOL;
echo 'Updated: ' . $updated . PHP_EOL;
echo 'From raw: ' . $fromRaw . PHP_EOL;
echo 'From chunks: ' . $fromChunks . PHP_EOL;
echo 'Skipped: ' . $skipped . PHP_EOL;