proofdb/app/service/ArchiveMetadataEnrichmentService.php
2026-05-01 23:40:14 +08:00

198 lines
7.0 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
namespace app\service;
use app\service\LLM\OpenAICompatibleClient;
use app\service\LLM\LLMRetryQueue;
use Throwable;
/**
 * Fills in missing archive metadata (title, year, author, tags, summary) by asking the
 * injected chat client for a JSON answer, and records the outcome of the attempt under
 * $payload['metadata']['ai_enrichment'].
 *
 * Configuration is read from the `LLMapi.metadata.*` keys via config().
 */
class ArchiveMetadataEnrichmentService
{
    /** Metadata fields this service may fill, in the order they are checked. */
    private const FIELDS = ['title', 'year', 'author', 'tags', 'summary'];

    /** Number of leading characters of the archive text mixed into the request id hash. */
    private const REQUEST_ID_TEXT_CHARS = 1000;

    private OpenAICompatibleClient $client;
    private LLMRetryQueue $queue;

    /**
     * @param OpenAICompatibleClient|null $client chat client; a default instance is created when null
     * @param LLMRetryQueue|null          $queue  retry queue; a default instance is created when null
     */
    public function __construct(?OpenAICompatibleClient $client = null, ?LLMRetryQueue $queue = null)
    {
        $this->client = $client ?? new OpenAICompatibleClient();
        $this->queue = $queue ?? new LLMRetryQueue();
    }

    /**
     * Fill the metadata fields that are missing from $payload.
     *
     * Never throws for a failed LLM call: the exception message is stored in
     * metadata.ai_enrichment.error and the payload is returned otherwise unchanged.
     *
     * @param array $payload archive record; only missing fields are overwritten
     * @return array the payload with metadata.ai_enrichment describing the attempt
     */
    public function enrich(array $payload): array
    {
        $missing = $this->missingFields($payload);
        $enabled = $this->enabled();

        if ($missing === [] || !$enabled) {
            return $this->withAiMeta($payload, [
                'enabled' => $enabled,
                'attempted' => false,
                'filled' => [],
                'missing' => $missing,
            ]);
        }

        try {
            // Build the request once so the values reported below are exactly the ones sent.
            $options = [
                'model' => config('LLMapi.metadata.model'),
                'temperature' => config('LLMapi.metadata.temperature', 0.1),
                'max_tokens' => config('LLMapi.metadata.max_tokens', 1200),
                'stream' => false,
                'response_format' => config('LLMapi.metadata.response_format', ['type' => 'json_object']),
                'thinking' => config('LLMapi.metadata.thinking', ['type' => 'disabled']),
                'request_id' => $this->requestId($payload, $missing),
            ];
            $messages = $this->messages($payload, $missing);

            $result = $this->queue->run(
                fn (): array => $this->client->chatJson($messages, $options),
                config('LLMapi.metadata.retry', [])
            );
        } catch (Throwable $exception) {
            return $this->withAiMeta($payload, [
                'enabled' => true,
                'attempted' => true,
                'filled' => [],
                'missing' => $missing,
                'error' => $exception->getMessage(),
            ]);
        }

        $filled = [];
        foreach ($missing as $field) {
            if (!array_key_exists($field, $result)) {
                continue;
            }

            // Normalize BEFORE judging usefulness: a raw value such as tags "a,b" or [""]
            // looks non-empty but normalizes to [], and must not be reported as filled.
            $value = $this->normalizeField($field, $result[$field]);
            if (!$this->isUsefulValue($field, $value)) {
                continue;
            }

            $payload[$field] = $value;
            $filled[] = $field;
        }

        return $this->withAiMeta($payload, [
            'enabled' => true,
            'attempted' => true,
            'filled' => $filled,
            'missing' => array_values(array_diff($missing, $filled)),
            'model' => $options['model'],
            'stream' => false,
            'response_format' => $options['response_format'],
            'thinking' => $options['thinking'],
        ]);
    }

    /**
     * @return array list of field names from self::FIELDS that have no useful value in $payload
     */
    private function missingFields(array $payload): array
    {
        return array_values(array_filter(
            self::FIELDS,
            fn (string $field): bool => !$this->hasUsefulValue($payload, $field)
        ));
    }

    /**
     * Enrichment runs only when the config flag is truthy and the client reports it is configured.
     */
    private function enabled(): bool
    {
        return (bool) config('LLMapi.metadata.enabled', true) && $this->client->isConfigured();
    }

    /**
     * Build the chat messages: a fixed system prompt plus a JSON user message carrying the
     * missing field names, the already-known fields and a sample of the archive text.
     *
     * @throws \JsonException when the user message cannot be JSON-encoded
     */
    private function messages(array $payload, array $missing): array
    {
        $text = $this->sampleText($payload);

        return [
            [
                'role' => 'system',
                'content' => implode("\n", [
                    '你是历史档案元数据整理助手。',
                    '你只能根据用户提供的档案文本抽取或推断元数据。',
                    '请只返回 JSON 对象,不要返回 Markdown不要解释。',
                    '字段title(string), year(integer|null), author(string|null), tags(array<string>), summary(string)。',
                    'summary 简洁概括档案内容80-200 字。',
                    'tags 用档案中常见专名和涉及主题5-10 个。',
                    '无法判断的字段返回 null 或空数组。',
                    '以上请均使用档案中的语言。',
                ]),
            ],
            [
                'role' => 'user',
                // Malformed UTF-8 in the text would make a plain json_encode() return false;
                // substitute the bad bytes and throw on any other failure.
                'content' => json_encode([
                    'missing_fields' => $missing,
                    'known_fields' => [
                        'title' => $payload['title'] ?? null,
                        'year' => $payload['year'] ?? null,
                        'author' => $payload['author'] ?? null,
                        'source' => $payload['source'] ?? null,
                        'series' => $payload['series'] ?? null,
                    ],
                    'archive_text_sample' => $text,
                ], JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_SUBSTITUTE | JSON_THROW_ON_ERROR),
            ],
        ];
    }

    /**
     * Deterministic request id: "metadata-" followed by the first 32 hex characters of a
     * SHA-256 over source, archive_uid, the leading archive text and the missing field list.
     *
     * The text comes from the same source as the prompt sample (content, else pages), so
     * page-only payloads no longer share one id.
     */
    private function requestId(array $payload, array $missing): string
    {
        $fingerprint = implode('|', [
            (string) ($payload['source'] ?? ''),
            (string) ($payload['archive_uid'] ?? ''),
            mb_substr($this->fullText($payload), 0, self::REQUEST_ID_TEXT_CHARS),
            implode(',', $missing),
        ]);

        return 'metadata-' . substr(hash('sha256', $fingerprint), 0, 32);
    }

    /**
     * Archive text truncated to LLMapi.metadata.max_input_chars characters (default 12000).
     */
    private function sampleText(array $payload): string
    {
        $maxChars = (int) config('LLMapi.metadata.max_input_chars', 12000);

        return mb_substr($this->fullText($payload), 0, $maxChars);
    }

    /**
     * Untruncated archive text: $payload['content'] when it is a string, otherwise the
     * string 'content' of each entry in $payload['pages'] joined by blank lines, otherwise ''.
     */
    private function fullText(array $payload): string
    {
        if (isset($payload['content']) && is_string($payload['content'])) {
            return $payload['content'];
        }

        if (!isset($payload['pages']) || !is_array($payload['pages'])) {
            return '';
        }

        $parts = [];
        foreach ($payload['pages'] as $page) {
            if (isset($page['content']) && is_string($page['content'])) {
                $parts[] = $page['content'];
            }
        }

        return implode("\n\n", $parts);
    }

    /**
     * Whether $payload already carries a usable value for $field.
     *
     * A title whose metadata.title_source is 'fallback' is treated as missing.
     */
    private function hasUsefulValue(array $payload, string $field): bool
    {
        if (!array_key_exists($field, $payload)) {
            return false;
        }

        if ($field === 'title' && (($payload['metadata']['title_source'] ?? null) === 'fallback')) {
            return false;
        }

        return $this->isUsefulValue($field, $payload[$field]);
    }

    /**
     * Value-level usefulness check: non-empty array, positive numeric year,
     * non-blank string, or any other non-null value.
     */
    private function isUsefulValue(string $field, mixed $value): bool
    {
        if (is_array($value)) {
            return $value !== [];
        }

        if ($field === 'year') {
            return is_numeric($value) && (int) $value > 0;
        }

        return is_string($value) ? trim($value) !== '' : $value !== null;
    }

    /**
     * Coerce a value returned by the model into the shape stored on the payload.
     *
     * - year: int when numeric, otherwise null
     * - tags: list of unique, trimmed, non-empty strings; non-arrays become []
     * - anything else: strings are trimmed, other values pass through unchanged
     */
    private function normalizeField(string $field, mixed $value): mixed
    {
        if ($field === 'year') {
            return is_numeric($value) ? (int) $value : null;
        }

        if ($field === 'tags') {
            if (!is_array($value)) {
                return [];
            }

            $tags = [];
            foreach ($value as $tag) {
                // Skip nested arrays, null and booleans instead of stringifying them.
                if (!is_string($tag) && !is_int($tag) && !is_float($tag)) {
                    continue;
                }

                $tag = trim((string) $tag);
                // Explicit '' comparison keeps the legitimate tag "0".
                if ($tag !== '') {
                    $tags[] = $tag;
                }
            }

            return array_values(array_unique($tags));
        }

        return is_string($value) ? trim($value) : $value;
    }

    /**
     * Store $ai under metadata.ai_enrichment, replacing a non-array 'metadata' with an empty array first.
     */
    private function withAiMeta(array $payload, array $ai): array
    {
        $payload['metadata'] = is_array($payload['metadata'] ?? null) ? $payload['metadata'] : [];
        $payload['metadata']['ai_enrichment'] = $ai;

        return $payload;
    }
}