198 lines
7.0 KiB
PHP
198 lines
7.0 KiB
PHP
<?php
|
||
|
||
namespace app\service;
|
||
|
||
use app\service\LLM\OpenAICompatibleClient;
|
||
use app\service\LLM\LLMRetryQueue;
|
||
use Throwable;
|
||
|
||
/**
 * Fills in missing archive metadata fields (title, year, author, tags, summary)
 * by sending a text sample to the injected chat client, and records the outcome
 * under $payload['metadata']['ai_enrichment'].
 *
 * The service never throws from enrich(): any failure while building or sending
 * the request is reported in the 'error' key of the ai_enrichment record and the
 * payload is returned otherwise unchanged.
 */
class ArchiveMetadataEnrichmentService
{
    /** Fields that may be requested from the model, in the order they are checked. */
    private const ENRICHABLE_FIELDS = ['title', 'year', 'author', 'tags', 'summary'];

    /** Number of leading characters of the archive text mixed into the request id. */
    private const REQUEST_ID_TEXT_CHARS = 1000;

    private OpenAICompatibleClient $client;

    private LLMRetryQueue $queue;

    /**
     * @param OpenAICompatibleClient|null $client Chat client; a default instance is created when omitted.
     * @param LLMRetryQueue|null          $queue  Retry wrapper; a default instance is created when omitted.
     */
    public function __construct(?OpenAICompatibleClient $client = null, ?LLMRetryQueue $queue = null)
    {
        $this->client = $client ?? new OpenAICompatibleClient();
        $this->queue = $queue ?? new LLMRetryQueue();
    }

    /**
     * Attempts to fill every missing metadata field of the payload.
     *
     * @param array $payload Archive record; read keys are the enrichable fields plus
     *                       'source', 'series', 'archive_uid', 'content', 'pages' and 'metadata'.
     *
     * @return array The payload with filled fields and an updated
     *               $payload['metadata']['ai_enrichment'] record
     *               (keys: enabled, attempted, filled, missing, and either
     *               error or model/stream/response_format/thinking).
     */
    public function enrich(array $payload): array
    {
        $missing = $this->missingFields($payload);
        $enabled = $this->enabled();

        if ($missing === [] || !$enabled) {
            return $this->withAiMeta($payload, [
                'enabled' => $enabled,
                'attempted' => false,
                'filled' => [],
                'missing' => $missing,
            ]);
        }

        try {
            // Built once so the options reported below are exactly the ones that were sent.
            $options = $this->requestOptions($payload, $missing);
            $messages = $this->messages($payload, $missing);

            $result = $this->queue->run(
                fn (): array => $this->client->chatJson($messages, $options),
                config('LLMapi.metadata.retry', [])
            );

            if (!is_array($result)) {
                // Keep the failure inside the try so it is reported instead of
                // surfacing later as a TypeError from hasUsefulValue().
                throw new \UnexpectedValueException('LLM metadata response is not an array.');
            }
        } catch (Throwable $exception) {
            return $this->withAiMeta($payload, [
                'enabled' => true,
                'attempted' => true,
                'filled' => [],
                'missing' => $missing,
                'error' => $exception->getMessage(),
            ]);
        }

        $filled = [];
        foreach ($missing as $field) {
            if (!$this->hasUsefulValue($result, $field)) {
                continue;
            }

            $value = $this->normalizeField($field, $result[$field]);

            // Normalisation can empty a value (e.g. tags containing only blanks, or a
            // non-scalar title); such a field must stay listed as missing.
            if (!$this->hasUsefulValue([$field => $value], $field)) {
                continue;
            }

            // NOTE(review): metadata.title_source is left untouched, so a title that was a
            // 'fallback' is still treated as missing on a later enrich() call — confirm intent.
            $payload[$field] = $value;
            $filled[] = $field;
        }

        return $this->withAiMeta($payload, [
            'enabled' => true,
            'attempted' => true,
            'filled' => $filled,
            'missing' => array_values(array_diff($missing, $filled)),
            'model' => $options['model'],
            'stream' => false,
            'response_format' => $options['response_format'],
            'thinking' => $options['thinking'],
        ]);
    }

    /**
     * Lists the enrichable fields that have no useful value in the payload.
     *
     * @return list<string>
     */
    private function missingFields(array $payload): array
    {
        return array_values(array_filter(
            self::ENRICHABLE_FIELDS,
            fn (string $field): bool => !$this->hasUsefulValue($payload, $field)
        ));
    }

    /**
     * True when the feature flag is on and the client reports itself configured.
     */
    private function enabled(): bool
    {
        return (bool) config('LLMapi.metadata.enabled', true) && $this->client->isConfigured();
    }

    /**
     * Builds the option array passed to the client's chatJson() call from the
     * 'LLMapi.metadata.*' configuration.
     *
     * @param list<string> $missing
     *
     * @return array<string, mixed>
     */
    private function requestOptions(array $payload, array $missing): array
    {
        return [
            'model' => config('LLMapi.metadata.model'),
            'temperature' => config('LLMapi.metadata.temperature', 0.1),
            'max_tokens' => config('LLMapi.metadata.max_tokens', 1200),
            'stream' => false,
            'response_format' => config('LLMapi.metadata.response_format', ['type' => 'json_object']),
            'thinking' => config('LLMapi.metadata.thinking', ['type' => 'disabled']),
            'request_id' => $this->requestId($payload, $missing),
        ];
    }

    /**
     * Builds the system and user chat messages for the metadata request.
     *
     * @param list<string> $missing
     *
     * @return list<array{role: string, content: string}>
     *
     * @throws \JsonException When the user message cannot be encoded.
     */
    private function messages(array $payload, array $missing): array
    {
        $systemPrompt = implode("\n", [
            '你是历史档案元数据整理助手。',
            '你只能根据用户提供的档案文本抽取或推断元数据。',
            '请只返回 JSON 对象,不要返回 Markdown,不要解释。',
            '字段:title(string), year(integer|null), author(string|null), tags(array<string>), summary(string)。',
            'summary 简洁概括档案内容,80-200 字。',
            'tags 用档案中常见专名和涉及主题,5-10 个。',
            '无法判断的字段返回 null 或空数组。',
            '以上请均使用档案中的语言。',
        ]);

        // Invalid UTF-8 bytes are substituted rather than making json_encode() return
        // false; any remaining encoding error throws and is reported by enrich().
        $userContent = json_encode(
            [
                'missing_fields' => $missing,
                'known_fields' => [
                    'title' => $payload['title'] ?? null,
                    'year' => $payload['year'] ?? null,
                    'author' => $payload['author'] ?? null,
                    'source' => $payload['source'] ?? null,
                    'series' => $payload['series'] ?? null,
                ],
                'archive_text_sample' => $this->sampleText($payload),
            ],
            JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_SUBSTITUTE | JSON_THROW_ON_ERROR
        );

        return [
            ['role' => 'system', 'content' => $systemPrompt],
            ['role' => 'user', 'content' => $userContent],
        ];
    }

    /**
     * Derives a deterministic request id ('metadata-' + 32 hex chars) from the
     * payload's source, archive_uid, the start of its text and the missing fields.
     *
     * @param list<string> $missing
     */
    private function requestId(array $payload, array $missing): string
    {
        $fingerprint = implode('|', [
            $this->stringify($payload['source'] ?? ''),
            $this->stringify($payload['archive_uid'] ?? ''),
            // Uses the same text source as the prompt, so paged archives without a
            // top-level 'content' string do not all hash to the same id.
            mb_substr($this->fullText($payload), 0, self::REQUEST_ID_TEXT_CHARS),
            implode(',', $missing),
        ]);

        return 'metadata-' . substr(hash('sha256', $fingerprint), 0, 32);
    }

    /**
     * Returns the archive text truncated to 'LLMapi.metadata.max_input_chars'
     * characters (default 12000).
     */
    private function sampleText(array $payload): string
    {
        // A negative limit would make mb_substr() trim from the end instead of capping.
        $maxChars = max(0, (int) config('LLMapi.metadata.max_input_chars', 12000));

        return mb_substr($this->fullText($payload), 0, $maxChars);
    }

    /**
     * Returns the untruncated archive text: the 'content' string when it is
     * non-blank, otherwise the string 'content' of each entry in 'pages'
     * joined by blank lines, otherwise an empty string.
     */
    private function fullText(array $payload): string
    {
        $content = $payload['content'] ?? null;
        if (is_string($content) && trim($content) !== '') {
            return $content;
        }

        $pages = $payload['pages'] ?? null;
        if (!is_array($pages)) {
            return is_string($content) ? $content : '';
        }

        $parts = [];
        foreach ($pages as $page) {
            if (!is_array($page) && !$page instanceof \ArrayAccess) {
                continue;
            }

            $pageContent = $page['content'] ?? null;
            if (is_string($pageContent)) {
                $parts[] = $pageContent;
            }
        }

        return implode("\n\n", $parts);
    }

    /**
     * Casts scalars and Stringable objects to string; anything else becomes ''
     * so arrays do not trigger an "Array to string conversion" warning.
     */
    private function stringify(mixed $value): string
    {
        return is_scalar($value) || $value instanceof \Stringable ? (string) $value : '';
    }

    /**
     * Tells whether $payload[$field] holds a value worth keeping.
     *
     * A title is never useful while $payload['metadata']['title_source'] is
     * 'fallback'; arrays must be non-empty; 'year' must be numeric and positive;
     * strings must be non-blank; any other value must be non-null.
     */
    private function hasUsefulValue(array $payload, string $field): bool
    {
        if (!array_key_exists($field, $payload)) {
            return false;
        }

        if ($field === 'title' && (($payload['metadata']['title_source'] ?? null) === 'fallback')) {
            return false;
        }

        $value = $payload[$field];
        if (is_array($value)) {
            return $value !== [];
        }

        if ($field === 'year') {
            return is_numeric($value) && (int) $value > 0;
        }

        return is_string($value) ? trim($value) !== '' : $value !== null;
    }

    /**
     * Coerces a model-supplied value to the shape expected for the field:
     * int|null for 'year', list<string> for 'tags', trimmed string|null otherwise.
     */
    private function normalizeField(string $field, mixed $value): mixed
    {
        return match ($field) {
            'year' => is_numeric($value) ? (int) $value : null,
            'tags' => $this->normalizeTags($value),
            default => $this->normalizeText($value),
        };
    }

    /**
     * Keeps string/number tags only, trimmed, without blanks or duplicates.
     *
     * @return list<string>
     */
    private function normalizeTags(mixed $value): array
    {
        if (!is_array($value)) {
            return [];
        }

        $tags = [];
        foreach ($value as $tag) {
            if (!is_string($tag) && !is_int($tag) && !is_float($tag)) {
                continue;
            }

            $tag = trim((string) $tag);
            // Explicit comparison: a truthiness filter would also drop the tag '0'.
            if ($tag !== '') {
                $tags[] = $tag;
            }
        }

        return array_values(array_unique($tags));
    }

    /**
     * Returns a trimmed string for string/number input and null for anything else,
     * so arrays or booleans from the model never land in a text field.
     */
    private function normalizeText(mixed $value): ?string
    {
        if (is_string($value)) {
            return trim($value);
        }

        return is_int($value) || is_float($value) ? (string) $value : null;
    }

    /**
     * Stores the enrichment record under $payload['metadata']['ai_enrichment'],
     * replacing a non-array 'metadata' value with an empty array first.
     */
    private function withAiMeta(array $payload, array $ai): array
    {
        if (!is_array($payload['metadata'] ?? null)) {
            $payload['metadata'] = [];
        }

        $payload['metadata']['ai_enrichment'] = $ai;

        return $payload;
    }
}
|