This commit is contained in:
Ziki Shay 2026-05-11 15:23:34 +08:00
parent ed70a140a2
commit 547cbbb4a5
25 changed files with 721 additions and 332 deletions

View File

@ -128,8 +128,6 @@ PATCH /api/admin/archives/{archive_uid}
- `series` - `series`
- `tags` - `tags`
- `metadata` - `metadata`
- `content`
- `raw`
其中: 其中:

View File

@ -52,8 +52,6 @@ curl <APIdomain>/api/archives/01KQHVREB6XPYF604RVZAP9NNY
"provider": "bigmodel" "provider": "bigmodel"
} }
}, },
"content": "full normalized archive content ...",
"raw": "# 1.test ...",
"chunks": [ "chunks": [
"01KQHVREB6XPYF604RVZAP9NNY_1_39003", "01KQHVREB6XPYF604RVZAP9NNY_1_39003",
"01KQHVREB6XPYF604RVZAP9NNY_2_12345" "01KQHVREB6XPYF604RVZAP9NNY_2_12345"
@ -65,8 +63,6 @@ curl <APIdomain>/api/archives/01KQHVREB6XPYF604RVZAP9NNY
说明: 说明:
- `content` 是归一化后的 archive 正文。
- `raw` 是导入时保存的原始 Markdown。
- `chunks` 是当前 archive 关联的 `chunk_uid` 列表。 - `chunks` 是当前 archive 关联的 `chunk_uid` 列表。
- `chunk_count` 方便调用方快速判断档案规模,而不必自己数数组长度。 - `chunk_count` 方便调用方快速判断档案规模,而不必自己数数组长度。

View File

@ -37,9 +37,23 @@ class ArticleImportController
try { try {
$service->persistSnapshot($result['data']); $service->persistSnapshot($result['data']);
(new \app\service\ArchiveRepository())->saveImport($result['data']); (new \app\service\ArchiveRepository())->saveImport($result['data']);
if (($result['data']['queue']['needs_ai_metadata'] ?? false) === true) { if (($result['data']['queue']['needs_ai_metadata'] ?? false) === true && (new \app\service\ArchiveMetadataEnrichmentService())->isEnabled()) {
(new \app\service\AiMetadataQueue())->push($result['data']['archive']['archive_uid']); $archiveUid = $result['data']['archive']['archive_uid'];
(new \app\service\ArchiveRepository())->markAiMetadataStatus($archiveUid, 'queued');
try {
(new \app\service\AiMetadataQueue())->push($archiveUid);
$result['data']['queue']['ai_metadata_enqueued'] = true; $result['data']['queue']['ai_metadata_enqueued'] = true;
} catch (Throwable $queueException) {
(new \app\service\ArchiveRepository())->markAiMetadataStatus($archiveUid, 'failed_retryable', $queueException->getMessage());
$result['data']['queue']['ai_metadata_enqueue_error'] = $queueException->getMessage();
}
} elseif (($result['data']['queue']['needs_ai_metadata'] ?? false) === true) {
(new \app\service\ArchiveRepository())->markAiMetadataStatus(
$result['data']['archive']['archive_uid'],
'disabled',
'AI metadata enrichment is not enabled.'
);
} }
} catch (Throwable $exception) { } catch (Throwable $exception) {
return $this->jsonResponse([ return $this->jsonResponse([

View File

@ -24,6 +24,17 @@ class AiMetadata
public function onWorkerStart(): void public function onWorkerStart(): void
{ {
Timer::add(10, fn (): int => $this->queue->releaseDueDelayed()); Timer::add(10, fn (): int => $this->queue->releaseDueDelayed());
Timer::add(max(30, (int) config('queue.ai_metadata.dispatcher_interval_seconds', 60)), function (): void {
try {
if (!$this->enrichment->isEnabled()) {
return;
}
foreach ($this->archives->queuePendingAiMetadataArchives((int) config('queue.ai_metadata.dispatcher_batch_size', 20)) as $archiveUid) {
$this->queue->push($archiveUid);
}
} catch (Throwable) {
}
});
while (true) { while (true) {
$this->queue->releaseDueDelayed(); $this->queue->releaseDueDelayed();
@ -47,10 +58,18 @@ class AiMetadata
} }
if (!$this->archives->archiveNeedsMetadata($archive)) { if (!$this->archives->archiveNeedsMetadata($archive)) {
$this->archives->markAiMetadataStatus($archiveUid, 'completed', null, ['attempted' => false]);
$this->queue->clearRetry($archiveUid); $this->queue->clearRetry($archiveUid);
return; return;
} }
if (!$this->enrichment->isEnabled()) {
$this->archives->markAiMetadataStatus($archiveUid, 'disabled', 'AI metadata enrichment is not enabled.');
$this->queue->clearRetry($archiveUid);
return;
}
$this->archives->markAiMetadataStatus($archiveUid, 'processing');
$payload = $archive; $payload = $archive;
$payload['content'] = $this->archives->findChunksText($archiveUid); $payload['content'] = $this->archives->findChunksText($archiveUid);
@ -58,7 +77,15 @@ class AiMetadata
$aiMeta = $enriched['metadata']['ai_enrichment'] ?? []; $aiMeta = $enriched['metadata']['ai_enrichment'] ?? [];
if (($aiMeta['attempted'] ?? false) !== true || ($aiMeta['error'] ?? null)) { if (($aiMeta['attempted'] ?? false) !== true || ($aiMeta['error'] ?? null)) {
$this->queue->retryLater($archiveUid, $aiMeta['error'] ?? 'AI metadata enrichment did not complete.'); $retryable = ($aiMeta['enabled'] ?? true) === true && ($aiMeta['error'] ?? null) !== null;
$status = $retryable ? 'failed_retryable' : 'failed_terminal';
$error = $aiMeta['error'] ?? 'AI metadata enrichment did not complete.';
$this->archives->markAiMetadataStatus($archiveUid, $status, $error, $aiMeta);
if ($retryable) {
$this->queue->retryLater($archiveUid, $error);
} else {
$this->queue->clearRetry($archiveUid);
}
return; return;
} }
@ -70,9 +97,16 @@ class AiMetadata
} }
$this->archives->updateMetadata($archiveUid, $fields, $aiMeta); $this->archives->updateMetadata($archiveUid, $fields, $aiMeta);
$this->archives->markAiMetadataStatus($archiveUid, 'completed', null, $aiMeta);
$this->queue->clearRetry($archiveUid); $this->queue->clearRetry($archiveUid);
} catch (Throwable $exception) { } catch (Throwable $exception) {
$status = $this->archives->isAiMetadataRetryable($exception) ? 'failed_retryable' : 'failed_terminal';
$this->archives->markAiMetadataStatus($archiveUid, $status, $exception->getMessage());
if ($status === 'failed_retryable') {
$this->queue->retryLater($archiveUid, $exception->getMessage()); $this->queue->retryLater($archiveUid, $exception->getMessage());
} else {
$this->queue->clearRetry($archiveUid);
}
} }
} }
} }

View File

@ -72,7 +72,7 @@ class ArchiveAdminService
} }
$updates = []; $updates = [];
foreach (['title', 'summary', 'author', 'source', 'series', 'content', 'raw'] as $field) { foreach (['title', 'summary', 'author', 'source', 'series'] as $field) {
if (array_key_exists($field, $payload)) { if (array_key_exists($field, $payload)) {
$updates[$field] = $this->nullableText($payload[$field]); $updates[$field] = $this->nullableText($payload[$field]);
} }
@ -134,8 +134,6 @@ class ArchiveAdminService
{ {
$data = $this->listItem($row); $data = $this->listItem($row);
$data['metadata'] = $this->decodeJson($row->metadata ?? null, []); $data['metadata'] = $this->decodeJson($row->metadata ?? null, []);
$data['content'] = $row->content;
$data['raw'] = $row->raw;
$data['chunks'] = $this->decodeJson($row->chunks ?? null, []); $data['chunks'] = $this->decodeJson($row->chunks ?? null, []);
return $data; return $data;

View File

@ -119,6 +119,14 @@ class MaintenanceScriptService
'doc_name' => 'setup_opensearch.md', 'doc_name' => 'setup_opensearch.md',
'args_hint' => '无参数', 'args_hint' => '无参数',
], ],
'reembed_chunks' => [
'name' => 'reembed_chunks',
'file' => 'reembed_chunks.php',
'label' => '重新生成向量',
'description' => '对 chunks 重新执行 embedding支持 resume 与 --reset。',
'doc_name' => 'reembed_chunks.md',
'args_hint' => '--archive_uid=01... 或 --reset',
],
'reindex_opensearch' => [ 'reindex_opensearch' => [
'name' => 'reindex_opensearch', 'name' => 'reindex_opensearch',
'file' => 'reindex_opensearch.php', 'file' => 'reindex_opensearch.php',
@ -127,14 +135,6 @@ class MaintenanceScriptService
'doc_name' => 'reindex_opensearch.md', 'doc_name' => 'reindex_opensearch.md',
'args_hint' => '--archive_uid=01...', 'args_hint' => '--archive_uid=01...',
], ],
'backfill_archive_content' => [
'name' => 'backfill_archive_content',
'file' => 'backfill_archive_content.php',
'label' => '回填 archive content',
'description' => '从 raw 或 chunks 回填 archives.content。',
'doc_name' => 'backfill_archive_content.md',
'args_hint' => '--archive_uid=01...',
],
'setup_admin_users' => [ 'setup_admin_users' => [
'name' => 'setup_admin_users', 'name' => 'setup_admin_users',
'file' => 'setup_admin_users.php', 'file' => 'setup_admin_users.php',

View File

@ -20,9 +20,9 @@ class ArchiveMetadataEnrichmentService
public function enrich(array $payload): array public function enrich(array $payload): array
{ {
$missing = $this->missingFields($payload); $missing = $this->missingFields($payload);
if ($missing === [] || !$this->enabled()) { if ($missing === [] || !$this->isEnabled()) {
return $this->withAiMeta($payload, [ return $this->withAiMeta($payload, [
'enabled' => $this->enabled(), 'enabled' => $this->isEnabled(),
'attempted' => false, 'attempted' => false,
'filled' => [], 'filled' => [],
'missing' => $missing, 'missing' => $missing,
@ -80,7 +80,7 @@ class ArchiveMetadataEnrichmentService
return array_values(array_filter($fields, fn (string $field): bool => !$this->hasUsefulValue($payload, $field))); return array_values(array_filter($fields, fn (string $field): bool => !$this->hasUsefulValue($payload, $field)));
} }
private function enabled(): bool public function isEnabled(): bool
{ {
return (bool) config('LLMapi.metadata.enabled', true) && $this->client->isConfigured(); return (bool) config('LLMapi.metadata.enabled', true) && $this->client->isConfigured();
} }

View File

@ -2,6 +2,7 @@
namespace app\service; namespace app\service;
use Throwable;
use support\Db; use support\Db;
class ArchiveRepository class ArchiveRepository
@ -24,8 +25,6 @@ class ArchiveRepository
'series' => $archive['series'] ?? null, 'series' => $archive['series'] ?? null,
'tags' => json_encode($archive['tags'] ?? [], JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES), 'tags' => json_encode($archive['tags'] ?? [], JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES),
'metadata' => json_encode($archive['metadata'] ?? [], JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES), 'metadata' => json_encode($archive['metadata'] ?? [], JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES),
'content' => $archive['content'] ?? null,
'raw' => $archive['raw'] ?? null,
'chunks' => json_encode($chunkUids, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES), 'chunks' => json_encode($chunkUids, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES),
] ]
); );
@ -194,6 +193,79 @@ class ArchiveRepository
Db::table('archives')->where('archive_uid', $archiveUid)->update($updates); Db::table('archives')->where('archive_uid', $archiveUid)->update($updates);
} }
public function queuePendingAiMetadataArchives(int $limit): array
{
$limit = max(1, $limit);
$rows = Db::table('archives')
->where(function ($query): void {
$query
->whereNull('title')
->orWhere('title', '')
->orWhereNull('summary')
->orWhere('summary', '')
->orWhereNull('author')
->orWhere('author', '')
->orWhereNull('year')
->orWhere('year', '<=', 0)
->orWhereRaw("tags = '[]'::jsonb")
->orWhereRaw("metadata #>> '{title_source}' = 'fallback'");
})
->orderByDesc('updated_time')
->limit(max($limit * 5, 50))
->get()
->all();
$archiveUids = [];
foreach ($rows as $row) {
$archive = $this->archiveToArray($row);
if (!$this->archiveNeedsMetadata($archive) || !$this->shouldQueueAiMetadata($archive)) {
continue;
}
$archiveUids[] = (string) $archive['archive_uid'];
$this->markAiMetadataStatus((string) $archive['archive_uid'], 'queued', null);
if (count($archiveUids) >= $limit) {
break;
}
}
return $archiveUids;
}
public function markAiMetadataStatus(string $archiveUid, string $status, ?string $error = null, array $extra = []): void
{
$archive = $this->findArchive($archiveUid);
if ($archive === null) {
return;
}
$metadata = is_array($archive['metadata'] ?? null) ? $archive['metadata'] : [];
$aiMeta = is_array($metadata['ai_enrichment'] ?? null) ? $metadata['ai_enrichment'] : [];
$attempt = ((int) ($aiMeta['attempt'] ?? 0)) + ($status === 'processing' ? 1 : 0);
$metadata['ai_enrichment'] = array_merge($aiMeta, $extra, [
'status' => $status,
'attempt' => $attempt,
'updated_at' => date(DATE_ATOM),
'error' => $error,
]);
Db::table('archives')->where('archive_uid', $archiveUid)->update([
'metadata' => json_encode($metadata, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES),
]);
}
public function isAiMetadataRetryable(Throwable $exception): bool
{
if (!$exception instanceof \app\service\LLM\LLMRequestException) {
return true;
}
$statusCode = $exception->statusCode();
return $statusCode === null || $statusCode === 429 || $statusCode >= 500;
}
public function archiveNeedsMetadata(array $archive): bool public function archiveNeedsMetadata(array $archive): bool
{ {
foreach (['title', 'year', 'author', 'tags', 'summary'] as $field) { foreach (['title', 'year', 'author', 'tags', 'summary'] as $field) {
@ -227,12 +299,29 @@ class ArchiveRepository
'series' => $archive->series, 'series' => $archive->series,
'tags' => json_decode($archive->tags ?? '[]', true) ?: [], 'tags' => json_decode($archive->tags ?? '[]', true) ?: [],
'metadata' => json_decode($archive->metadata ?? '{}', true) ?: [], 'metadata' => json_decode($archive->metadata ?? '{}', true) ?: [],
'content' => $archive->content,
'raw' => $archive->raw,
'chunks' => json_decode($archive->chunks ?? '[]', true) ?: [], 'chunks' => json_decode($archive->chunks ?? '[]', true) ?: [],
]; ];
} }
private function shouldQueueAiMetadata(array $archive): bool
{
$aiMeta = is_array($archive['metadata']['ai_enrichment'] ?? null) ? $archive['metadata']['ai_enrichment'] : [];
$status = (string) ($aiMeta['status'] ?? '');
if ($status === 'failed_terminal') {
return false;
}
$updatedAt = strtotime((string) ($aiMeta['updated_at'] ?? '')) ?: 0;
$staleAfter = max(60, (int) config('queue.ai_metadata.stale_after_seconds', 900));
$isFresh = $updatedAt > 0 && (time() - $updatedAt) < $staleAfter;
if (in_array($status, ['queued', 'processing'], true) && $isFresh) {
return false;
}
return true;
}
private function chunkRowToArray(object $row): array private function chunkRowToArray(object $row): array
{ {
return [ return [

View File

@ -70,16 +70,6 @@ class ArticleImportService
} }
} }
public function normalizeArchiveContentString(string $content): ?string
{
return $this->nullableClean($this->cleanMarkdownPage($content));
}
public function normalizeArchiveRawString(string $content): ?string
{
return $this->nullableClean($content);
}
private function validate(array $payload): array private function validate(array $payload): array
{ {
$errors = []; $errors = [];
@ -192,8 +182,6 @@ class ArticleImportService
'tags' => is_array($payload['tags'] ?? null) ? array_values($payload['tags']) : [], 'tags' => is_array($payload['tags'] ?? null) ? array_values($payload['tags']) : [],
'summary' => $this->nullableClean($payload['summary'] ?? null), 'summary' => $this->nullableClean($payload['summary'] ?? null),
'metadata' => $payload['metadata'] ?? [], 'metadata' => $payload['metadata'] ?? [],
'content' => $this->normalizedArchiveContent($payload),
'raw' => $this->rawArchiveContent($payload),
]; ];
} }
@ -210,57 +198,6 @@ class ArticleImportService
return $this->pageBlocksFromItems($payload, preg_split('/\R{2,}/u', $payload['content'])); return $this->pageBlocksFromItems($payload, preg_split('/\R{2,}/u', $payload['content']));
} }
private function normalizedArchiveContent(array $payload): ?string
{
if (isset($payload['pages']) && is_array($payload['pages'])) {
$parts = [];
foreach ($payload['pages'] as $page) {
if (!is_array($page) || !isset($page['content']) || !is_string($page['content'])) {
continue;
}
$content = $this->cleanMarkdownPage($page['content']);
if ($content !== '') {
$parts[] = $content;
}
}
return $this->nullableClean(implode("\n\n", $parts));
}
if (isset($payload['paragraphs']) && is_array($payload['paragraphs'])) {
$parts = [];
foreach ($payload['paragraphs'] as $paragraph) {
$content = is_array($paragraph) ? ($paragraph['content'] ?? '') : $paragraph;
if (!is_string($content)) {
continue;
}
$content = $this->clean($content);
if ($content !== '') {
$parts[] = $content;
}
}
return $this->nullableClean(implode("\n\n", $parts));
}
if (isset($payload['content']) && is_string($payload['content'])) {
return $this->normalizeArchiveContentString($payload['content']);
}
return null;
}
private function rawArchiveContent(array $payload): ?string
{
if (isset($payload['content']) && is_string($payload['content'])) {
return $this->normalizeArchiveRawString($payload['content']);
}
return null;
}
private function pageBlocksFromPages(array $payload): array private function pageBlocksFromPages(array $payload): array
{ {
$pageBlocks = []; $pageBlocks = [];

View File

@ -22,24 +22,29 @@ class ChunkEmbeddingHandler
$this->retryQueue = $retryQueue ?? new LLMRetryQueue(); $this->retryQueue = $retryQueue ?? new LLMRetryQueue();
} }
public function handle(array $task): void public function handle(array $task): int
{ {
if (($task['target_type'] ?? null) !== 'archive') { if (($task['target_type'] ?? null) !== 'archive') {
return; return 0;
} }
$archiveUid = trim((string) ($task['target_uid'] ?? '')); $archiveUid = trim((string) ($task['target_uid'] ?? ''));
if ($archiveUid === '') { if ($archiveUid === '') {
return; return 0;
} }
$batchSize = (int) config('LLMapi.embedding.batch_size', 32); $batchSize = (int) config('LLMapi.embedding.batch_size', 32);
$chunks = $this->chunks->findQueuedChunks($archiveUid, $batchSize); $chunks = $this->chunks->findQueuedChunks($archiveUid, $batchSize);
if ($chunks === []) { if ($chunks === []) {
return; return 0;
} }
$chunkUids = array_column($chunks, 'chunk_uid'); $chunkUids = array_column($chunks, 'chunk_uid');
if (!$this->client->isConfigured()) {
$this->chunks->markFailed($chunkUids, 'BigModel embedding API is not configured.', false);
return count($chunkUids);
}
$this->chunks->markProcessing($chunkUids); $this->chunks->markProcessing($chunkUids);
try { try {
@ -52,6 +57,7 @@ class ChunkEmbeddingHandler
); );
$this->persistEmbeddings($chunks, $payload); $this->persistEmbeddings($chunks, $payload);
return count($chunkUids);
} catch (Throwable $exception) { } catch (Throwable $exception) {
$this->chunks->markFailed($chunkUids, $exception->getMessage(), $this->isRetryable($exception)); $this->chunks->markFailed($chunkUids, $exception->getMessage(), $this->isRetryable($exception));
throw $exception; throw $exception;

View File

@ -6,11 +6,70 @@ use support\Db;
class ChunkEmbeddingRepository class ChunkEmbeddingRepository
{ {
public function countChunks(?string $archiveUid = null): int
{
$query = Db::table('chunks');
if ($archiveUid !== null && trim($archiveUid) !== '') {
$query->where('archive_uid', trim($archiveUid));
}
return (int) $query->count();
}
public function countChunksByStatuses(array $statuses, ?string $archiveUid = null): int
{
$query = Db::table('chunks')->whereIn('embedding_status', $statuses);
if ($archiveUid !== null && trim($archiveUid) !== '') {
$query->where('archive_uid', trim($archiveUid));
}
return (int) $query->count();
}
public function resetAllChunksToPending(?string $archiveUid = null): int
{
$query = Db::table('chunks');
if ($archiveUid !== null && trim($archiveUid) !== '') {
$query->where('archive_uid', trim($archiveUid));
}
return $query->update($this->pendingResetPayload());
}
public function resetRecoverableChunksToPending(?string $archiveUid = null): int
{
$query = Db::table('chunks')
->whereIn('embedding_status', [
EmbeddingStatus::QUEUED,
EmbeddingStatus::PROCESSING,
EmbeddingStatus::FAILED_RETRYABLE,
]);
if ($archiveUid !== null && trim($archiveUid) !== '') {
$query->where('archive_uid', trim($archiveUid));
}
return $query->update($this->pendingResetPayload());
}
public function queuePendingArchiveTasks(int $limit): array public function queuePendingArchiveTasks(int $limit): array
{ {
$statuses = [EmbeddingStatus::PENDING, EmbeddingStatus::QUEUED, EmbeddingStatus::FAILED_RETRYABLE]; $staleBefore = date('Y-m-d H:i:s', time() - max(60, (int) config('queue.tasks.stale_after_seconds', 900)));
$archiveUids = Db::table('chunks') $archiveUids = Db::table('chunks')
->whereIn('embedding_status', $statuses) ->where(function ($query) use ($staleBefore): void {
$query
->whereIn('embedding_status', [EmbeddingStatus::PENDING, EmbeddingStatus::FAILED_RETRYABLE])
->orWhere(function ($stale) use ($staleBefore): void {
$stale
->whereIn('embedding_status', [EmbeddingStatus::QUEUED, EmbeddingStatus::PROCESSING])
->where(function ($time) use ($staleBefore): void {
$time->whereNull('embedding_updated_at')->orWhere('embedding_updated_at', '<', $staleBefore);
});
});
})
->select('archive_uid') ->select('archive_uid')
->groupBy('archive_uid') ->groupBy('archive_uid')
->orderByRaw('MIN(id)') ->orderByRaw('MIN(id)')
@ -22,7 +81,17 @@ class ChunkEmbeddingRepository
foreach ($archiveUids as $archiveUid) { foreach ($archiveUids as $archiveUid) {
Db::table('chunks') Db::table('chunks')
->where('archive_uid', $archiveUid) ->where('archive_uid', $archiveUid)
->whereIn('embedding_status', $statuses) ->where(function ($query) use ($staleBefore): void {
$query
->whereIn('embedding_status', [EmbeddingStatus::PENDING, EmbeddingStatus::FAILED_RETRYABLE])
->orWhere(function ($stale) use ($staleBefore): void {
$stale
->whereIn('embedding_status', [EmbeddingStatus::QUEUED, EmbeddingStatus::PROCESSING])
->where(function ($time) use ($staleBefore): void {
$time->whereNull('embedding_updated_at')->orWhere('embedding_updated_at', '<', $staleBefore);
});
});
})
->update([ ->update([
'embedding_status' => EmbeddingStatus::QUEUED, 'embedding_status' => EmbeddingStatus::QUEUED,
'embedding_error' => null, 'embedding_error' => null,
@ -94,4 +163,18 @@ class ChunkEmbeddingRepository
'embedding_updated_at' => Db::raw('CURRENT_TIMESTAMP'), 'embedding_updated_at' => Db::raw('CURRENT_TIMESTAMP'),
]); ]);
} }
private function pendingResetPayload(): array
{
return [
'embedding_status' => EmbeddingStatus::PENDING,
'embedding_ref' => null,
'embedding_model' => null,
'embedding_error' => null,
'embedding_updated_at' => null,
'search_index_status' => 0,
'search_index_error' => null,
'search_index_updated_at' => null,
];
}
} }

View File

@ -17,20 +17,20 @@ class ChunkSearchIndexHandler
$this->index = $index ?? new OpenSearchChunkIndex(); $this->index = $index ?? new OpenSearchChunkIndex();
} }
public function handle(array $task): void public function handle(array $task): int
{ {
if (($task['target_type'] ?? null) !== 'archive') { if (($task['target_type'] ?? null) !== 'archive') {
return; return 0;
} }
$archiveUid = trim((string) ($task['target_uid'] ?? '')); $archiveUid = trim((string) ($task['target_uid'] ?? ''));
if ($archiveUid === '') { if ($archiveUid === '') {
return; return 0;
} }
$documents = $this->chunks->findQueuedDocuments($archiveUid, (int) config('opensearch.bulk.chunk_size', 500)); $documents = $this->chunks->findQueuedDocuments($archiveUid, (int) config('opensearch.bulk.chunk_size', 500));
if ($documents === []) { if ($documents === []) {
return; return 0;
} }
$chunkUids = array_column($documents, 'chunk_uid'); $chunkUids = array_column($documents, 'chunk_uid');
@ -39,7 +39,7 @@ class ChunkSearchIndexHandler
try { try {
$documents = $this->validatedDocuments($documents); $documents = $this->validatedDocuments($documents);
if ($documents === []) { if ($documents === []) {
return; return 0;
} }
$chunkUids = array_column($documents, 'chunk_uid'); $chunkUids = array_column($documents, 'chunk_uid');
@ -53,6 +53,7 @@ class ChunkSearchIndexHandler
$indexedChunkUids = array_values(array_diff($chunkUids, $failedChunkUids)); $indexedChunkUids = array_values(array_diff($chunkUids, $failedChunkUids));
$this->chunks->markIndexed($indexedChunkUids); $this->chunks->markIndexed($indexedChunkUids);
return count($chunkUids);
} catch (Throwable $exception) { } catch (Throwable $exception) {
$this->chunks->markFailed($chunkUids, $exception->getMessage(), true); $this->chunks->markFailed($chunkUids, $exception->getMessage(), true);
throw $exception; throw $exception;

View File

@ -7,6 +7,31 @@ use support\Db;
class ChunkSearchIndexRepository class ChunkSearchIndexRepository
{ {
public function countEmbeddedChunks(?string $archiveUid = null): int
{
$query = Db::table('chunks')
->where('embedding_status', EmbeddingStatus::EMBEDDED);
if ($archiveUid !== null && trim($archiveUid) !== '') {
$query->where('archive_uid', trim($archiveUid));
}
return (int) $query->count();
}
public function countIndexedChunks(?string $archiveUid = null): int
{
$query = Db::table('chunks')
->where('embedding_status', EmbeddingStatus::EMBEDDED)
->where('search_index_status', SearchIndexStatus::INDEXED);
if ($archiveUid !== null && trim($archiveUid) !== '') {
$query->where('archive_uid', trim($archiveUid));
}
return (int) $query->count();
}
public function resetEmbeddedChunksToPending(?string $archiveUid = null): int public function resetEmbeddedChunksToPending(?string $archiveUid = null): int
{ {
$query = Db::table('chunks') $query = Db::table('chunks')
@ -23,18 +48,44 @@ class ChunkSearchIndexRepository
]); ]);
} }
public function queuePendingArchiveTasks(int $limit): array public function resetRecoverableChunksToPending(?string $archiveUid = null): int
{ {
$statuses = [ $query = Db::table('chunks')
SearchIndexStatus::PENDING, ->where('embedding_status', EmbeddingStatus::EMBEDDED)
->whereIn('search_index_status', [
SearchIndexStatus::QUEUED, SearchIndexStatus::QUEUED,
SearchIndexStatus::INDEXING, SearchIndexStatus::INDEXING,
SearchIndexStatus::FAILED_RETRYABLE, SearchIndexStatus::FAILED_RETRYABLE,
]; ]);
if ($archiveUid !== null && trim($archiveUid) !== '') {
$query->where('archive_uid', trim($archiveUid));
}
return $query->update([
'search_index_status' => SearchIndexStatus::PENDING,
'search_index_error' => null,
'search_index_updated_at' => null,
]);
}
public function queuePendingArchiveTasks(int $limit): array
{
$staleBefore = date('Y-m-d H:i:s', time() - max(60, (int) config('queue.tasks.stale_after_seconds', 900)));
$archiveUids = Db::table('chunks') $archiveUids = Db::table('chunks')
->where('embedding_status', EmbeddingStatus::EMBEDDED) ->where('embedding_status', EmbeddingStatus::EMBEDDED)
->whereIn('search_index_status', $statuses) ->where(function ($query) use ($staleBefore): void {
$query
->whereIn('search_index_status', [SearchIndexStatus::PENDING, SearchIndexStatus::FAILED_RETRYABLE])
->orWhere(function ($stale) use ($staleBefore): void {
$stale
->whereIn('search_index_status', [SearchIndexStatus::QUEUED, SearchIndexStatus::INDEXING])
->where(function ($time) use ($staleBefore): void {
$time->whereNull('search_index_updated_at')->orWhere('search_index_updated_at', '<', $staleBefore);
});
});
})
->select('archive_uid') ->select('archive_uid')
->groupBy('archive_uid') ->groupBy('archive_uid')
->orderByRaw('MIN(id)') ->orderByRaw('MIN(id)')
@ -47,7 +98,17 @@ class ChunkSearchIndexRepository
Db::table('chunks') Db::table('chunks')
->where('archive_uid', $archiveUid) ->where('archive_uid', $archiveUid)
->where('embedding_status', EmbeddingStatus::EMBEDDED) ->where('embedding_status', EmbeddingStatus::EMBEDDED)
->whereIn('search_index_status', $statuses) ->where(function ($query) use ($staleBefore): void {
$query
->whereIn('search_index_status', [SearchIndexStatus::PENDING, SearchIndexStatus::FAILED_RETRYABLE])
->orWhere(function ($stale) use ($staleBefore): void {
$stale
->whereIn('search_index_status', [SearchIndexStatus::QUEUED, SearchIndexStatus::INDEXING])
->where(function ($time) use ($staleBefore): void {
$time->whereNull('search_index_updated_at')->orWhere('search_index_updated_at', '<', $staleBefore);
});
});
})
->update([ ->update([
'search_index_status' => SearchIndexStatus::QUEUED, 'search_index_status' => SearchIndexStatus::QUEUED,
'search_index_error' => null, 'search_index_error' => null,

View File

@ -139,12 +139,6 @@
<label class="field-label" for="archive-metadata">metadata</label> <label class="field-label" for="archive-metadata">metadata</label>
<textarea class="text-area admin-code-area" id="archive-metadata" name="metadata" rows="8" placeholder='{"key":"value"}'></textarea> <textarea class="text-area admin-code-area" id="archive-metadata" name="metadata" rows="8" placeholder='{"key":"value"}'></textarea>
<label class="field-label" for="archive-content">content</label>
<textarea class="text-area admin-code-area" id="archive-content" name="content" rows="10"></textarea>
<label class="field-label" for="archive-raw">raw</label>
<textarea class="text-area admin-code-area" id="archive-raw" name="raw" rows="10"></textarea>
<div class="admin-form-actions"> <div class="admin-form-actions">
<button class="button primary" type="submit">保存档案</button> <button class="button primary" type="submit">保存档案</button>
<button class="button" type="button" id="archive-delete">删除档案</button> <button class="button" type="button" id="archive-delete">删除档案</button>
@ -525,8 +519,6 @@ async function loadArchiveDetail(archiveUid) {
field(els.archiveForm, 'tags').value = (data.tags || []).join(', '); field(els.archiveForm, 'tags').value = (data.tags || []).join(', ');
field(els.archiveForm, 'summary').value = data.summary || ''; field(els.archiveForm, 'summary').value = data.summary || '';
field(els.archiveForm, 'metadata').value = JSON.stringify(data.metadata || {}, null, 2); field(els.archiveForm, 'metadata').value = JSON.stringify(data.metadata || {}, null, 2);
field(els.archiveForm, 'content').value = data.content || '';
field(els.archiveForm, 'raw').value = data.raw || '';
} }
async function saveArchive(event) { async function saveArchive(event) {
@ -553,9 +545,7 @@ async function saveArchive(event) {
series: field(els.archiveForm, 'series').value, series: field(els.archiveForm, 'series').value,
tags: field(els.archiveForm, 'tags').value, tags: field(els.archiveForm, 'tags').value,
summary: field(els.archiveForm, 'summary').value, summary: field(els.archiveForm, 'summary').value,
metadata, metadata
content: field(els.archiveForm, 'content').value,
raw: field(els.archiveForm, 'raw').value
}; };
await api(`/api/admin/archives/${encodeURIComponent(state.archiveUid)}`, { await api(`/api/admin/archives/${encodeURIComponent(state.archiveUid)}`, {

View File

@ -11,6 +11,9 @@ return [
'base_delay_seconds' => (int) (getenv('AI_METADATA_QUEUE_BASE_DELAY_SECONDS') ?: 60), 'base_delay_seconds' => (int) (getenv('AI_METADATA_QUEUE_BASE_DELAY_SECONDS') ?: 60),
'block_timeout' => (int) (getenv('AI_METADATA_QUEUE_BLOCK_TIMEOUT') ?: 5), 'block_timeout' => (int) (getenv('AI_METADATA_QUEUE_BLOCK_TIMEOUT') ?: 5),
'idle_sleep_seconds' => (int) (getenv('AI_METADATA_QUEUE_IDLE_SLEEP_SECONDS') ?: 1), 'idle_sleep_seconds' => (int) (getenv('AI_METADATA_QUEUE_IDLE_SLEEP_SECONDS') ?: 1),
'dispatcher_interval_seconds' => (int) (getenv('AI_METADATA_QUEUE_DISPATCHER_INTERVAL_SECONDS') ?: 60),
'dispatcher_batch_size' => (int) (getenv('AI_METADATA_QUEUE_DISPATCHER_BATCH_SIZE') ?: 20),
'stale_after_seconds' => (int) (getenv('AI_METADATA_QUEUE_STALE_AFTER_SECONDS') ?: 900),
], ],
'tasks' => [ 'tasks' => [
'pending' => 'proofdb:tasks:pending', 'pending' => 'proofdb:tasks:pending',
@ -24,5 +27,6 @@ return [
'idle_sleep_seconds' => (int) (getenv('PROOFDB_TASK_QUEUE_IDLE_SLEEP_SECONDS') ?: 1), 'idle_sleep_seconds' => (int) (getenv('PROOFDB_TASK_QUEUE_IDLE_SLEEP_SECONDS') ?: 1),
'dispatcher_interval_seconds' => (int) (getenv('PROOFDB_TASK_DISPATCHER_INTERVAL_SECONDS') ?: 15), 'dispatcher_interval_seconds' => (int) (getenv('PROOFDB_TASK_DISPATCHER_INTERVAL_SECONDS') ?: 15),
'dispatcher_batch_size' => (int) (getenv('PROOFDB_TASK_DISPATCHER_BATCH_SIZE') ?: 20), 'dispatcher_batch_size' => (int) (getenv('PROOFDB_TASK_DISPATCHER_BATCH_SIZE') ?: 20),
'stale_after_seconds' => (int) (getenv('PROOFDB_TASK_STALE_AFTER_SECONDS') ?: 900),
], ],
]; ];

View File

@ -262,7 +262,8 @@ GET /api/evidence/{chunk_uid}
- [x] Archive detail API is implemented with external documentation: `GET /api/archives/{archive_uid}`. - [x] Archive detail API is implemented with external documentation: `GET /api/archives/{archive_uid}`.
- [x] Archive chunk-list and archive evidence-list APIs are implemented with external documentation: `GET /api/archives/{archive_uid}/chunks` and `GET /api/archives/{archive_uid}/evidence`. - [x] Archive chunk-list and archive evidence-list APIs are implemented with external documentation: `GET /api/archives/{archive_uid}/chunks` and `GET /api/archives/{archive_uid}/evidence`.
- [x] Evidence smoke test passed for `01KQHVREB6XPYF604RVZAP9NNY_1_39003`, returning page label, citation string, and chunk quote. - [x] Evidence smoke test passed for `01KQHVREB6XPYF604RVZAP9NNY_1_39003`, returning page label, citation string, and chunk quote.
- [x] Historical `archives.content` can now be repaired with `php scripts/backfill_archive_content.php`, using normalized `raw` when available and ordered chunk text as fallback. - [x] `archives` rows should no longer persist redundant `content` or `raw` bodies. Archive body reconstruction should come from chunks or the original Markdown source outside PostgreSQL.
- [x] AI metadata, embedding, and OpenSearch indexing paths now have resumable recovery logic for stale queued/processing work instead of relying only on in-memory progress.
- [x] OpenSearch repair/reindex maintenance script exists: `php scripts/reindex_opensearch.php`, with optional `--archive_uid=...` targeting. - [x] OpenSearch repair/reindex maintenance script exists: `php scripts/reindex_opensearch.php`, with optional `--archive_uid=...` targeting.
### Partially Done ### Partially Done

View File

@ -5,8 +5,8 @@
- [setup_database.md](/www/proofdb/scriptdoc/setup_database.md): PostgreSQL 结构初始化与升级 - [setup_database.md](/www/proofdb/scriptdoc/setup_database.md): PostgreSQL 结构初始化与升级
- [setup_admin_users.md](/www/proofdb/scriptdoc/setup_admin_users.md): 管理员用户表与首个管理员账号初始化 - [setup_admin_users.md](/www/proofdb/scriptdoc/setup_admin_users.md): 管理员用户表与首个管理员账号初始化
- [setup_opensearch.md](/www/proofdb/scriptdoc/setup_opensearch.md): OpenSearch 索引初始化 - [setup_opensearch.md](/www/proofdb/scriptdoc/setup_opensearch.md): OpenSearch 索引初始化
- [reembed_chunks.md](/www/proofdb/scriptdoc/reembed_chunks.md): chunk 向量重新生成与断点续跑
- [reindex_opensearch.md](/www/proofdb/scriptdoc/reindex_opensearch.md): OpenSearch 重建索引与回灌 - [reindex_opensearch.md](/www/proofdb/scriptdoc/reindex_opensearch.md): OpenSearch 重建索引与回灌
- [backfill_archive_content.md](/www/proofdb/scriptdoc/backfill_archive_content.md): 历史 archive `content` 正文字段回填
## 当前运维脚本 ## 当前运维脚本
@ -14,8 +14,8 @@
scripts/setup_database.php scripts/setup_database.php
scripts/setup_admin_users.php scripts/setup_admin_users.php
scripts/setup_opensearch.php scripts/setup_opensearch.php
scripts/reembed_chunks.php
scripts/reindex_opensearch.php scripts/reindex_opensearch.php
scripts/backfill_archive_content.php
``` ```
## 推荐顺序 ## 推荐顺序

View File

@ -1,76 +1,13 @@
# Archive Content 回填脚本 # Archive Content 回填脚本(已废弃)
## 脚本路径 `archives.content``archives.raw` 已经从 PostgreSQL 设计中移除。
```text 因此:
scripts/backfill_archive_content.php
```
## 脚本作用 - `scripts/backfill_archive_content.php` 不再执行任何回填逻辑
- 该脚本现在只会输出废弃提示并退出
回填历史 `archives.content` 字段。 当前约定是:
这个脚本主要用于修复旧数据中 `content` 为空的问题。它会按下面顺序尝试生成 `content` - archive 正文从 `chunks` 重建
- 原始 Markdown 由 PostgreSQL 外部的原始来源负责保存
1. 如果 archive 有 `raw`,就按当前导入规则把原始 Markdown 规范化成正文文本。
2. 如果 `raw` 为空,就按 `chunk_index` 顺序拼接现有 chunk 的 `text` 作为回退正文。
脚本不会伪造 `raw`。如果历史数据里 `raw` 丢了,脚本只会尽力补 `content`
## 运行前提
- 当前环境中的 PostgreSQL 配置可用。
- 项目依赖已安装完成。
- 从项目根目录执行命令。
## 运行命令
默认只处理 `content` 为空的 archive
```bash
php scripts/backfill_archive_content.php
```
只处理一个 archive
```bash
php scripts/backfill_archive_content.php --archive_uid=01KQHVREB6XPYF604RVZAP9NNY
```
强制重算,即使 `content` 已经有值:
```bash
php scripts/backfill_archive_content.php --force
```
只预览,不写数据库:
```bash
php scripts/backfill_archive_content.php --dry-run
```
## 成功输出示例
```text
[updated] 01KQHVREB6XPYF604RVZAP9NNY source=chunks content_length=6375
Archive content backfill completed.
Archive filter: auto
Force mode: no
Dry run: no
Scanned: 1
Updated: 1
From raw: 0
From chunks: 1
Skipped: 0
```
## 适用场景
- 修复旧版本导入留下的 `archives.content` 为空问题。
- 导入逻辑更新后,希望重算归一化正文。
- 为后续 AI / RAG / archive 级读取补齐正文字段。
## 重要限制
- 如果历史数据既没有 `raw`,也没有 chunks脚本会跳过该 archive。
- 用 chunks 回填时,得到的是拼接后的正文文本,不会恢复原始 Markdown 结构。

113
scriptdoc/reembed_chunks.md Normal file
View File

@ -0,0 +1,113 @@
# Chunk 重新 Embedding 脚本
## 脚本路径
```text
scripts/reembed_chunks.php
```
## 脚本作用
根据 PostgreSQL 中已有的 `chunks.text`,重新生成 embedding并把结果写回 `chunks.embedding_*` 字段。
脚本会做这些事:
1. 判断当前是“断点续跑”还是“全量重建”。
2. 把需要恢复的 chunk 状态置回 `pending`
3. 逐个 archive 拉取待处理 chunk并按 embedding request batch 调用现有 handler。
4. 输出基于 chunk 数的进度条、batch 明细和最终统计结果。
## 断点续跑语义
默认模式下,脚本会尽量**保留已经完成的 `embedded` 进度**
- 只把 `queued / processing / failed_retryable` 的 chunk 重新置回 `pending`
- 已经是 `embedded` 的 chunk 不会重复生成
- `failed_terminal` 也不会自动重试
这意味着如果中途断电、断网、进程被杀:
- 已经成功写回 embedding 的 chunk 进度不会丢
- 未完成的那部分在下次重跑脚本时会继续
如果你明确想从头开始全部重新 embedding可以手动加 `--reset`
## 运行前提
- PostgreSQL 可连接。
- BigModel / Zhipu embedding API 配置可用。
- 项目依赖已安装完成。
- 从项目根目录执行命令。
## 运行命令
默认断点续跑:
```bash
php scripts/reembed_chunks.php
```
强制从头全量重新 embedding
```bash
php scripts/reembed_chunks.php --reset
```
只处理一个 archive
```bash
php scripts/reembed_chunks.php --archive_uid=01KQHVREB6XPYF604RVZAP9NNY
```
## 成功输出示例
```text
Progress granularity: embedding request batches (up to 32 chunks each)
Re-embedding [================================] 100.0% (14/14)
Batch #1 archive=01KQHVREB6XPYF604RVZAP9NNY chunks=14 progress=14/14
Chunk re-embedding completed.
Archive filter: (all chunks)
Mode: resume
Eligible chunks: 14
Embedding batch size: 32
Reset chunks: 2
Processed archives: 1
Processed batches: 1
Embedded chunk rows now marked embedded: 14
Terminal failures: 0
Archives: 01KQHVREB6XPYF604RVZAP9NNY
Next step: refresh OpenSearch vectors with `php scripts/reindex_opensearch.php`
```
## 关于进度条为什么可能“0 到 100”
这个脚本的进度条是按 **chunk 数** 计算的,但刷新粒度是 **一次 embedding request batch 完成后**
所以如果:
- 当前只有一个 archive 需要处理
- 且该 archive 的待处理 chunk 数小于等于 `LLM_EMBEDDING_BATCH_SIZE`
那么整个 archive 会在 1 次请求里完成,终端看起来就会像是从 `0` 直接跳到 `100`。这不是进度丢失,而是因为这次实际只跑了 `1` 个 batch。
## reset 行为
`--reset` 后,会把目标范围内的 chunk
- `embedding_status` 重置为 `pending`
- 清空 `embedding_ref`
- 清空 `embedding_model`
- 清空 `embedding_error`
- 同时把 `search_index_status` 置回 `pending`
所以 `--reset` 的语义是:**把 embedding 和后续索引链路都当成需要重建**。
## 重要限制
这个脚本不会直接重建 OpenSearch。
重新 embedding 完成后,如果你希望 OpenSearch 中也使用新的向量,请继续执行:
```bash
php scripts/reindex_opensearch.php
```

View File

@ -13,10 +13,29 @@ scripts/reindex_opensearch.php
脚本会做这些事: 脚本会做这些事:
1. 确保 OpenSearch 索引存在。 1. 确保 OpenSearch 索引存在。
2. 把已向量化 chunk 的 `search_index_status` 重置为待索引。 2. 判断当前是“断点续跑”还是“全量重建”。
3. 按 archive 批量重新投递索引任务。 3. 按 archive 选取待处理数据,并按 OpenSearch bulk batch 调用现有 indexing handler。
4. 调用现有 OpenSearch indexing handler 批量写入 chunk 文档。 4. 批量写入 chunk 文档。
5. 输出重建统计结果。 5. 输出基于 chunk 数的进度条、batch 明细和最终统计结果。
## 断点续跑语义
默认模式下,脚本会尽量**保留已经完成的 `indexed` 进度**
- 如果 OpenSearch 索引还在,就只把 `queued / indexing / failed_retryable` 的 chunk 重新置回 `pending`,然后继续处理。
- 如果 OpenSearch 索引已经不存在,脚本会自动切换到全量重建模式,把所有已向量化 chunk 重新置回 `pending`
这意味着如果中途断电、断网、进程被杀:
- 已经标记为 `indexed` 的 chunk 进度不会丢。
- 未完成的那部分在下次重跑脚本时会继续。
如果你明确想从头开始全部重建,可以手动加 `--reset`
更具体地说:
- 如果 PostgreSQL 里该 chunk 已经是 `indexed`,默认 `resume` 不会重写它,即使 OpenSearch 里同 `_id` 文档已经存在。
- `--reset` 会把目标范围内所有已向量化 chunk 的 `search_index_status` 统一重置为 `pending`,然后重新 upsert 到 OpenSearch。
## 运行前提 ## 运行前提
@ -40,6 +59,12 @@ OPENSEARCH_SSL_VERIFY=false php scripts/reindex_opensearch.php
php scripts/reindex_opensearch.php php scripts/reindex_opensearch.php
``` ```
强制从头全量重建:
```bash
php scripts/reindex_opensearch.php --reset
```
只重建一个 archive 只重建一个 archive
```bash ```bash
@ -49,15 +74,33 @@ php scripts/reindex_opensearch.php --archive_uid=01KQHVREB6XPYF604RVZAP9NNY
## 成功输出示例 ## 成功输出示例
```text ```text
Progress granularity: OpenSearch bulk batches (up to 500 chunks each)
Reindexing [================================] 100.0% (14/14)
Batch #1 archive=01KQHVREB6XPYF604RVZAP9NNY chunks=14 progress=14/14
OpenSearch reindex completed. OpenSearch reindex completed.
Index: proofdb_chunks Index: proofdb_chunks
Archive filter: (all embedded archives) Archive filter: (all embedded archives)
Reset chunks: 14 Mode: resume
Eligible embedded chunks: 14
OpenSearch bulk size: 500
Reset chunks: 3
Indexed archives: 1 Indexed archives: 1
Processed batches: 1
Indexed chunk rows now marked indexed: 14 Indexed chunk rows now marked indexed: 14
Archives: 01KQHVREB6XPYF604RVZAP9NNY Archives: 01KQHVREB6XPYF604RVZAP9NNY
``` ```
## 关于进度条为什么可能“0 到 100”
这个脚本的进度条是按 **chunk 数** 计算的,但刷新粒度是 **一次 OpenSearch bulk batch 完成后**
所以如果:
- 当前只有一个 archive 需要处理
- 且该 archive 的待处理 chunk 数小于等于 `opensearch.bulk.chunk_size`
那么终端上就会像是从 `0` 直接跳到 `100`。这不是脚本没工作,而是因为这次实际只跑了 `1` 个 bulk batch。
## 适用场景 ## 适用场景
- `proofdb_chunks` 被误删后恢复。 - `proofdb_chunks` 被误删后恢复。

View File

@ -14,6 +14,7 @@ scripts/setup_database.php
- `archives` 表存在。 - `archives` 表存在。
- `chunks` 表存在。 - `chunks` 表存在。
- `archives.content``archives.raw` 冗余列会被移除。
- 档案与 chunk 的常用索引存在。 - 档案与 chunk 的常用索引存在。
- embedding / search index 相关状态字段存在。 - embedding / search index 相关状态字段存在。
- `updated_time` 自动更新时间触发器存在。 - `updated_time` 自动更新时间触发器存在。

View File

@ -1,117 +1,5 @@
#!/usr/bin/env php #!/usr/bin/env php
<?php <?php
use app\service\ArticleImportService; fwrite(STDERR, "Deprecated: archives.content and archives.raw have been removed from PostgreSQL. This script is no longer supported.\n");
use support\Db; exit(1);
require __DIR__ . '/../vendor/autoload.php';
require __DIR__ . '/../support/bootstrap.php';
require __DIR__ . '/../vendor/webman/database/src/support/Db.php';
$archiveUid = null;
$force = false;
$dryRun = false;
foreach (array_slice($argv, 1) as $argument) {
if (str_starts_with($argument, '--archive_uid=')) {
$archiveUid = substr($argument, strlen('--archive_uid='));
continue;
}
if ($argument === '--force') {
$force = true;
continue;
}
if ($argument === '--dry-run') {
$dryRun = true;
}
}
$query = Db::table('archives')->orderBy('id');
if ($archiveUid !== null && trim($archiveUid) !== '') {
$query->where('archive_uid', trim($archiveUid));
}
if (!$force) {
$query->where(function ($builder) {
$builder->whereNull('content')->orWhere('content', '');
});
}
$archives = $query->get(['archive_uid', 'title', 'content', 'raw'])->all();
$normalizer = new ArticleImportService();
$scanned = 0;
$updated = 0;
$fromRaw = 0;
$fromChunks = 0;
$skipped = 0;
foreach ($archives as $archive) {
$scanned++;
$archiveUidValue = (string) $archive->archive_uid;
$raw = is_string($archive->raw ?? null) ? $archive->raw : null;
$content = null;
$source = 'none';
if (is_string($raw) && trim($raw) !== '') {
$content = $normalizer->normalizeArchiveContentString($raw);
$source = 'raw';
} else {
$chunks = Db::table('chunks')
->where('archive_uid', $archiveUidValue)
->orderBy('chunk_index')
->pluck('text')
->all();
$chunks = array_values(array_filter(array_map(
static fn ($value): string => trim((string) $value),
$chunks
), static fn (string $value): bool => $value !== ''));
if ($chunks !== []) {
$content = trim(implode("\n\n", $chunks));
$source = 'chunks';
}
}
if ($content === null || $content === '') {
$skipped++;
echo "[skip] {$archiveUidValue} no usable raw/chunks" . PHP_EOL;
continue;
}
if ($dryRun) {
echo "[dry-run] {$archiveUidValue} source={$source} content_length=" . mb_strlen($content) . PHP_EOL;
if ($source === 'raw') {
$fromRaw++;
} else {
$fromChunks++;
}
continue;
}
Db::table('archives')
->where('archive_uid', $archiveUidValue)
->update(['content' => $content]);
$updated++;
if ($source === 'raw') {
$fromRaw++;
} else {
$fromChunks++;
}
echo "[updated] {$archiveUidValue} source={$source} content_length=" . mb_strlen($content) . PHP_EOL;
}
echo 'Archive content backfill completed.' . PHP_EOL;
echo 'Archive filter: ' . ($archiveUid ?: 'auto') . PHP_EOL;
echo 'Force mode: ' . ($force ? 'yes' : 'no') . PHP_EOL;
echo 'Dry run: ' . ($dryRun ? 'yes' : 'no') . PHP_EOL;
echo 'Scanned: ' . $scanned . PHP_EOL;
echo 'Updated: ' . $updated . PHP_EOL;
echo 'From raw: ' . $fromRaw . PHP_EOL;
echo 'From chunks: ' . $fromChunks . PHP_EOL;
echo 'Skipped: ' . $skipped . PHP_EOL;

135
scripts/reembed_chunks.php Normal file
View File

@ -0,0 +1,135 @@
#!/usr/bin/env php
<?php
use app\service\Embedding\ChunkEmbeddingHandler;
use app\service\Embedding\ChunkEmbeddingRepository;
use app\service\Embedding\EmbeddingStatus;
require __DIR__ . '/../vendor/autoload.php';
require __DIR__ . '/../support/bootstrap.php';
$archiveUid = null;
$forceReset = false;
foreach (array_slice($argv, 1) as $argument) {
if (str_starts_with($argument, '--archive_uid=')) {
$archiveUid = substr($argument, strlen('--archive_uid='));
continue;
}
if ($argument === '--reset') {
$forceReset = true;
}
}
$repository = new ChunkEmbeddingRepository();
$handler = new ChunkEmbeddingHandler();
$batchSize = max(1, (int) config('LLMapi.embedding.batch_size', 32));
try {
$totalChunks = $repository->countChunks($archiveUid);
if ($totalChunks === 0) {
echo 'Chunk re-embedding completed.' . PHP_EOL;
echo 'Archive filter: ' . ($archiveUid ?: '(all chunks)') . PHP_EOL;
echo 'Mode: nothing-to-do' . PHP_EOL;
echo 'Eligible chunks: 0' . PHP_EOL;
exit(0);
}
$mode = $forceReset ? 'reset' : 'resume';
$resetCount = $mode === 'reset'
? $repository->resetAllChunksToPending($archiveUid)
: $repository->resetRecoverableChunksToPending($archiveUid);
$batchCount = 0;
$processedArchives = [];
$progress = completedCount($repository, $archiveUid);
echo 'Progress granularity: embedding request batches (up to ' . $batchSize . ' chunks each)' . PHP_EOL;
renderProgress($progress, $totalChunks, 'Re-embedding');
while (true) {
$archiveUids = $repository->queuePendingArchiveTasks(100);
if ($archiveUid !== null && trim($archiveUid) !== '') {
$archiveUids = array_values(array_filter($archiveUids, static fn (string $uid): bool => $uid === trim($archiveUid)));
}
if ($archiveUids === []) {
break;
}
foreach ($archiveUids as $uid) {
$processedChunkCount = $handler->handle([
'task_type' => 'embedding',
'target_type' => 'archive',
'target_uid' => $uid,
'attempt' => 1,
]);
if ($processedChunkCount <= 0) {
continue;
}
$batchCount++;
$processedArchives[] = $uid;
$progress = completedCount($repository, $archiveUid);
renderProgress($progress, $totalChunks, 'Re-embedding');
fwrite(STDOUT, PHP_EOL . sprintf(
'Batch #%d archive=%s chunks=%d progress=%d/%d%s',
$batchCount,
$uid,
$processedChunkCount,
$progress,
$totalChunks,
PHP_EOL
));
}
}
$embeddedChunks = $repository->countChunksByStatuses([EmbeddingStatus::EMBEDDED], $archiveUid);
$terminalFailures = $repository->countChunksByStatuses([EmbeddingStatus::FAILED_TERMINAL], $archiveUid);
renderProgress($embeddedChunks + $terminalFailures, $totalChunks, 'Re-embedding', true);
echo 'Chunk re-embedding completed.' . PHP_EOL;
echo 'Archive filter: ' . ($archiveUid ?: '(all chunks)') . PHP_EOL;
echo 'Mode: ' . $mode . ($forceReset ? ' (--reset)' : '') . PHP_EOL;
echo 'Eligible chunks: ' . $totalChunks . PHP_EOL;
echo 'Embedding batch size: ' . $batchSize . PHP_EOL;
echo 'Reset chunks: ' . $resetCount . PHP_EOL;
echo 'Processed archives: ' . count(array_unique($processedArchives)) . PHP_EOL;
echo 'Processed batches: ' . $batchCount . PHP_EOL;
echo 'Embedded chunk rows now marked embedded: ' . $embeddedChunks . PHP_EOL;
echo 'Terminal failures: ' . $terminalFailures . PHP_EOL;
if ($processedArchives !== []) {
echo 'Archives: ' . implode(', ', $processedArchives) . PHP_EOL;
}
echo 'Next step: refresh OpenSearch vectors with `php scripts/reindex_opensearch.php'
. ($archiveUid ? ' --archive_uid=' . $archiveUid : '')
. ($forceReset ? ' --reset' : '')
. '`' . PHP_EOL;
} catch (Throwable $exception) {
fwrite(STDERR, PHP_EOL . $exception::class . ': ' . $exception->getMessage() . PHP_EOL);
exit(1);
}
function completedCount(ChunkEmbeddingRepository $repository, ?string $archiveUid): int
{
return $repository->countChunksByStatuses([
EmbeddingStatus::EMBEDDED,
EmbeddingStatus::FAILED_TERMINAL,
], $archiveUid);
}
function renderProgress(int $done, int $total, string $label, bool $final = false): void
{
$total = max(1, $total);
$done = max(0, min($done, $total));
$width = 32;
$filled = (int) floor(($done / $total) * $width);
$bar = str_repeat('=', $filled) . str_repeat(' ', max(0, $width - $filled));
$percent = str_pad(number_format(($done / $total) * 100, 1), 5, ' ', STR_PAD_LEFT);
$line = sprintf("\r%s [%s] %s%% (%d/%d)", $label, $bar, $percent, $done, $total);
fwrite(STDOUT, $line);
if ($final || $done >= $total) {
fwrite(STDOUT, PHP_EOL);
}
}

View File

@ -3,31 +3,59 @@
use app\service\Search\ChunkSearchIndexHandler; use app\service\Search\ChunkSearchIndexHandler;
use app\service\Search\ChunkSearchIndexRepository; use app\service\Search\ChunkSearchIndexRepository;
use app\service\Search\OpenSearchClientFactory;
use app\service\Search\OpenSearchChunkIndex; use app\service\Search\OpenSearchChunkIndex;
use support\Db;
require __DIR__ . '/../vendor/autoload.php'; require __DIR__ . '/../vendor/autoload.php';
require __DIR__ . '/../support/bootstrap.php'; require __DIR__ . '/../support/bootstrap.php';
require __DIR__ . '/../vendor/webman/database/src/support/Db.php';
$archiveUid = null; $archiveUid = null;
$forceReset = false;
foreach (array_slice($argv, 1) as $argument) { foreach (array_slice($argv, 1) as $argument) {
if (str_starts_with($argument, '--archive_uid=')) { if (str_starts_with($argument, '--archive_uid=')) {
$archiveUid = substr($argument, strlen('--archive_uid=')); $archiveUid = substr($argument, strlen('--archive_uid='));
continue;
}
if ($argument === '--reset') {
$forceReset = true;
} }
} }
$repository = new ChunkSearchIndexRepository(); $repository = new ChunkSearchIndexRepository();
$handler = new ChunkSearchIndexHandler(); $handler = new ChunkSearchIndexHandler();
$index = new OpenSearchChunkIndex(); $index = new OpenSearchChunkIndex();
$clientFactory = new OpenSearchClientFactory();
$bulkSize = max(1, (int) config('opensearch.bulk.chunk_size', 500));
try { try {
$client = $clientFactory->make();
$indexName = config('opensearch.indices.chunks', 'proofdb_chunks');
$indexExists = (bool) $client->indices()->exists(['index' => $indexName]);
$index->ensureExists(); $index->ensureExists();
$resetCount = $repository->resetEmbeddedChunksToPending($archiveUid); $totalChunks = $repository->countEmbeddedChunks($archiveUid);
$archiveCount = 0; if ($totalChunks === 0) {
echo 'OpenSearch reindex completed.' . PHP_EOL;
echo 'Index: ' . $indexName . PHP_EOL;
echo 'Archive filter: ' . ($archiveUid ?: '(all embedded archives)') . PHP_EOL;
echo 'Mode: nothing-to-do' . PHP_EOL;
echo 'Eligible embedded chunks: 0' . PHP_EOL;
exit(0);
}
$mode = $forceReset || !$indexExists ? 'reset' : 'resume';
$resetCount = $mode === 'reset'
? $repository->resetEmbeddedChunksToPending($archiveUid)
: $repository->resetRecoverableChunksToPending($archiveUid);
$batchCount = 0;
$indexedArchives = []; $indexedArchives = [];
$indexedChunks = 0; $progress = $repository->countIndexedChunks($archiveUid);
echo 'Progress granularity: OpenSearch bulk batches (up to ' . $bulkSize . ' chunks each)' . PHP_EOL;
renderProgress($progress, $totalChunks, 'Reindexing');
while (true) { while (true) {
$archiveUids = $repository->queuePendingArchiveTasks(100); $archiveUids = $repository->queuePendingArchiveTasks(100);
@ -36,28 +64,44 @@ try {
} }
foreach ($archiveUids as $uid) { foreach ($archiveUids as $uid) {
$handler->handle([ $processedChunkCount = $handler->handle([
'task_type' => 'search_index', 'task_type' => 'search_index',
'target_type' => 'archive', 'target_type' => 'archive',
'target_uid' => $uid, 'target_uid' => $uid,
'attempt' => 1, 'attempt' => 1,
]); ]);
$archiveCount++; if ($processedChunkCount <= 0) {
continue;
}
$batchCount++;
$indexedArchives[] = $uid; $indexedArchives[] = $uid;
$progress = $repository->countIndexedChunks($archiveUid);
renderProgress($progress, $totalChunks, 'Reindexing');
fwrite(STDOUT, PHP_EOL . sprintf(
'Batch #%d archive=%s chunks=%d progress=%d/%d%s',
$batchCount,
$uid,
$processedChunkCount,
$progress,
$totalChunks,
PHP_EOL
));
} }
} }
$indexedChunksQuery = Db::table('chunks')->where('search_index_status', 3); $indexedChunks = $repository->countIndexedChunks($archiveUid);
if ($archiveUid !== null && trim($archiveUid) !== '') { renderProgress($indexedChunks, $totalChunks, 'Reindexing', true);
$indexedChunksQuery->where('archive_uid', trim($archiveUid));
}
$indexedChunks = (int) $indexedChunksQuery->count();
echo 'OpenSearch reindex completed.' . PHP_EOL; echo 'OpenSearch reindex completed.' . PHP_EOL;
echo 'Index: ' . config('opensearch.indices.chunks', 'proofdb_chunks') . PHP_EOL; echo 'Index: ' . $indexName . PHP_EOL;
echo 'Archive filter: ' . ($archiveUid ?: '(all embedded archives)') . PHP_EOL; echo 'Archive filter: ' . ($archiveUid ?: '(all embedded archives)') . PHP_EOL;
echo 'Mode: ' . $mode . ($forceReset ? ' (--reset)' : (!$indexExists ? ' (index was missing)' : '')) . PHP_EOL;
echo 'Eligible embedded chunks: ' . $totalChunks . PHP_EOL;
echo 'OpenSearch bulk size: ' . $bulkSize . PHP_EOL;
echo 'Reset chunks: ' . $resetCount . PHP_EOL; echo 'Reset chunks: ' . $resetCount . PHP_EOL;
echo 'Indexed archives: ' . $archiveCount . PHP_EOL; echo 'Indexed archives: ' . count(array_unique($indexedArchives)) . PHP_EOL;
echo 'Processed batches: ' . $batchCount . PHP_EOL;
echo 'Indexed chunk rows now marked indexed: ' . $indexedChunks . PHP_EOL; echo 'Indexed chunk rows now marked indexed: ' . $indexedChunks . PHP_EOL;
if ($indexedArchives !== []) { if ($indexedArchives !== []) {
echo 'Archives: ' . implode(', ', $indexedArchives) . PHP_EOL; echo 'Archives: ' . implode(', ', $indexedArchives) . PHP_EOL;
@ -66,3 +110,19 @@ try {
fwrite(STDERR, $exception::class . ': ' . $exception->getMessage() . PHP_EOL); fwrite(STDERR, $exception::class . ': ' . $exception->getMessage() . PHP_EOL);
exit(1); exit(1);
} }
function renderProgress(int $done, int $total, string $label, bool $final = false): void
{
$total = max(1, $total);
$done = max(0, min($done, $total));
$width = 32;
$filled = (int) floor(($done / $total) * $width);
$bar = str_repeat('=', $filled) . str_repeat(' ', max(0, $width - $filled));
$percent = str_pad(number_format(($done / $total) * 100, 1), 5, ' ', STR_PAD_LEFT);
$line = sprintf("\r%s [%s] %s%% (%d/%d)", $label, $bar, $percent, $done, $total);
fwrite(STDOUT, $line);
if ($final || $done >= $total) {
fwrite(STDOUT, PHP_EOL);
}
}

View File

@ -20,14 +20,14 @@ CREATE TABLE IF NOT EXISTS archives (
series TEXT, series TEXT,
tags JSONB NOT NULL DEFAULT '[]'::jsonb, tags JSONB NOT NULL DEFAULT '[]'::jsonb,
metadata JSONB NOT NULL DEFAULT '{}'::jsonb, metadata JSONB NOT NULL DEFAULT '{}'::jsonb,
content TEXT,
raw TEXT,
chunks JSONB NOT NULL DEFAULT '[]'::jsonb, chunks JSONB NOT NULL DEFAULT '[]'::jsonb,
created_time TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP, created_time TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_time TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP updated_time TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP
) )
SQL, SQL,
'ALTER TABLE archives ADD COLUMN IF NOT EXISTS summary TEXT', 'ALTER TABLE archives ADD COLUMN IF NOT EXISTS summary TEXT',
'ALTER TABLE archives DROP COLUMN IF EXISTS content',
'ALTER TABLE archives DROP COLUMN IF EXISTS raw',
<<<SQL <<<SQL
CREATE TABLE IF NOT EXISTS chunks ( CREATE TABLE IF NOT EXISTS chunks (
id BIGSERIAL PRIMARY KEY, id BIGSERIAL PRIMARY KEY,