118 lines
3.1 KiB
PHP
118 lines
3.1 KiB
PHP
#!/usr/bin/env php
|
|
<?php
|
|
|
|
use app\service\ArticleImportService;
|
|
use support\Db;
|
|
|
|
require __DIR__ . '/../vendor/autoload.php';
|
|
require __DIR__ . '/../support/bootstrap.php';
|
|
require __DIR__ . '/../vendor/webman/database/src/support/Db.php';
|
|
|
|
// Command-line options (all optional):
//   --archive_uid=<uid>  only process the archive with this uid
//   --force              do not restrict the run to archives whose content is NULL/empty
//   --dry-run            report what would be written without updating any row
$archiveUid = null;
$force = false;
$dryRun = false;

foreach (array_slice($argv, 1) as $argument) {
    if (str_starts_with($argument, '--archive_uid=')) {
        // Keep the value verbatim here; it is trimmed where the filter is applied.
        $archiveUid = substr($argument, strlen('--archive_uid='));
        continue;
    }

    if ($argument === '--force') {
        $force = true;
        continue;
    }

    if ($argument === '--dry-run') {
        $dryRun = true;
        continue;
    }

    // Anything else used to be dropped silently, so a typo such as "--dryrun"
    // would run a real update with no hint. Keep going (existing invocations
    // still work) but make the ignored argument visible on STDERR.
    fwrite(STDERR, "[warn] ignoring unrecognized argument: {$argument}" . PHP_EOL);
}
|
|
|
|
// Build the list of archives to process, ordered by id.
$query = Db::table('archives')->orderBy('id');

// Optional single-archive filter; a missing or blank value means "no filter".
$archiveUidFilter = $archiveUid === null ? '' : trim($archiveUid);
if ($archiveUidFilter !== '') {
    $query->where('archive_uid', $archiveUidFilter);
}

// Without --force, only archives whose content is NULL or an empty string are selected.
if (!$force) {
    $query->where(static function ($builder): void {
        $builder->whereNull('content')->orWhere('content', '');
    });
}

// Only archive_uid and raw are read by the processing loop, so the previously
// fetched title/content columns are no longer loaded into memory for every row.
$archives = $query->get(['archive_uid', 'raw'])->all();

// Service used to normalize a raw archive payload into content.
$normalizer = new ArticleImportService();
|
|
|
|
// Run statistics printed in the final summary.
$scanned = 0;
$updated = 0;
$fromRaw = 0;
$fromChunks = 0;
$skipped = 0;

// For each selected archive, derive content from its raw payload or, failing
// that, from its chunks, then write it back (or only report it in dry-run mode).
foreach ($archives as $archive) {
    $scanned++;

    $archiveUidValue = (string) $archive->archive_uid;
    $raw = $archive->raw ?? null;
    $content = null;
    $source = 'none';

    // Preferred source: the raw payload, passed through the import normalizer.
    if (is_string($raw) && trim($raw) !== '') {
        $content = $normalizer->normalizeArchiveContentString($raw);
        $source = 'raw';
    }

    // Fallback source: the archive's chunks, joined in chunk_index order.
    // This also runs when raw was present but normalized to nothing; before,
    // such archives were reported as "no usable raw/chunks" without the
    // chunks ever being consulted.
    if ($content === null || $content === '') {
        $chunkTexts = Db::table('chunks')
            ->where('archive_uid', $archiveUidValue)
            ->orderBy('chunk_index')
            ->pluck('text')
            ->all();

        // Trim every chunk and drop the ones that end up empty.
        $chunkTexts = array_values(array_filter(
            array_map(static fn ($text): string => trim((string) $text), $chunkTexts),
            static fn (string $text): bool => $text !== '',
        ));

        if ($chunkTexts !== []) {
            // Every piece is already trimmed and non-empty, so the joined
            // string needs no further trimming.
            $content = implode("\n\n", $chunkTexts);
            $source = 'chunks';
        }
    }

    if ($content === null || $content === '') {
        $skipped++;
        echo "[skip] {$archiveUidValue} no usable raw/chunks" . PHP_EOL;
        continue;
    }

    // Per-source counters are kept in dry-run mode as well as on real updates.
    if ($source === 'raw') {
        $fromRaw++;
    } else {
        $fromChunks++;
    }

    $contentLength = mb_strlen($content);

    if ($dryRun) {
        echo "[dry-run] {$archiveUidValue} source={$source} content_length=" . $contentLength . PHP_EOL;
        continue;
    }

    Db::table('archives')
        ->where('archive_uid', $archiveUidValue)
        ->update(['content' => $content]);

    $updated++;

    echo "[updated] {$archiveUidValue} source={$source} content_length=" . $contentLength . PHP_EOL;
}
|
|
|
|
// Final run summary.
// The filter label uses the same rule as the query filter (non-null and not
// blank after trimming). The former `?:` truthiness test printed "auto" for
// --archive_uid=0 even though that filter was applied, and echoed a
// whitespace-only value even though no filter was applied.
$archiveFilterLabel = ($archiveUid !== null && trim($archiveUid) !== '')
    ? trim($archiveUid)
    : 'auto';

echo 'Archive content backfill completed.' . PHP_EOL;
echo 'Archive filter: ' . $archiveFilterLabel . PHP_EOL;
echo 'Force mode: ' . ($force ? 'yes' : 'no') . PHP_EOL;
echo 'Dry run: ' . ($dryRun ? 'yes' : 'no') . PHP_EOL;
echo 'Scanned: ' . $scanned . PHP_EOL;
echo 'Updated: ' . $updated . PHP_EOL;
echo 'From raw: ' . $fromRaw . PHP_EOL;
echo 'From chunks: ' . $fromChunks . PHP_EOL;
echo 'Skipped: ' . $skipped . PHP_EOL;
|