proofdb/app/service/Embedding/ChunkEmbeddingHandler.php
2026-05-11 15:23:34 +08:00

112 lines
3.6 KiB
PHP

<?php
namespace app\service\Embedding;
use app\service\LLM\LLMRequestException;
use app\service\LLM\LLMRetryQueue;
use Throwable;
/**
 * Embeds the queued text chunks of an archive and stores the resulting vectors.
 *
 * For a task targeting an archive, a batch of queued chunks is loaded from the
 * repository, sent to the embedding client through the retry queue, and every
 * chunk is then marked either as embedded or as failed.
 */
class ChunkEmbeddingHandler
{
    private BigModelEmbeddingClient $client;
    private ChunkEmbeddingRepository $chunks;
    private LLMRetryQueue $retryQueue;

    /**
     * Each collaborator may be injected; any that is omitted is instantiated with
     * its no-argument constructor.
     */
    public function __construct(
        ?BigModelEmbeddingClient $client = null,
        ?ChunkEmbeddingRepository $chunks = null,
        ?LLMRetryQueue $retryQueue = null
    ) {
        $this->client = $client ?? new BigModelEmbeddingClient();
        $this->chunks = $chunks ?? new ChunkEmbeddingRepository();
        $this->retryQueue = $retryQueue ?? new LLMRetryQueue();
    }

    /**
     * Processes one embedding task.
     *
     * @param array $task Task description; `target_type` must be 'archive' and
     *                    `target_uid` must be a non-blank archive identifier.
     *
     * @return int Number of chunks picked up by this run (including chunks that
     *             ended up marked as failed), or 0 when there was nothing to do.
     *
     * @throws Throwable Any error raised by the embedding request or by persisting
     *                   the results is rethrown after the affected chunks have
     *                   been marked as failed.
     */
    public function handle(array $task): int
    {
        if (($task['target_type'] ?? null) !== 'archive') {
            return 0;
        }

        $archiveUid = trim((string) ($task['target_uid'] ?? ''));
        if ($archiveUid === '') {
            return 0;
        }

        $batchSize = (int) config('LLMapi.embedding.batch_size', 32);
        $chunks = $this->chunks->findQueuedChunks($archiveUid, $batchSize);
        if ($chunks === []) {
            return 0;
        }

        $chunkUids = array_column($chunks, 'chunk_uid');

        if (!$this->client->isConfigured()) {
            // A missing configuration will not fix itself, so the failure is
            // recorded as non-retryable.
            $this->chunks->markFailed($chunkUids, 'BigModel embedding API is not configured.', false);

            return count($chunkUids);
        }

        $this->chunks->markProcessing($chunkUids);

        // Only the request itself is guarded here: when it fails, no chunk has
        // been settled yet, so the whole batch is marked as failed.
        try {
            $payload = $this->retryQueue->run(
                fn (): array => $this->client->embed(array_column($chunks, 'text'), [
                    'model' => config('LLMapi.embedding.model', 'embedding-3'),
                    'dimensions' => config('LLMapi.embedding.dimensions', 2048),
                ]),
                config('LLMapi.embedding.retry', [])
            );
        } catch (Throwable $exception) {
            $this->chunks->markFailed($chunkUids, $exception->getMessage(), $this->isRetryable($exception));

            throw $exception;
        }

        // Persistence performs its own failure bookkeeping so that chunks already
        // stored are not flipped back to "failed" by a later error.
        $this->persistEmbeddings($chunks, $payload);

        return count($chunkUids);
    }

    /**
     * Stores one embedding per chunk, matching response items to chunks by position.
     *
     * Chunks without a usable response item are marked as failed (retryable).
     * If storing fails part-way through, only the chunks that have not been
     * settled yet are marked as failed before the error is rethrown.
     *
     * @param array $chunks  Chunk rows, each with a `chunk_uid`; any array keys
     *                       are accepted, the order is what matters.
     * @param array $payload Decoded embedding response (`model`, `usage`, `data`).
     *
     * @throws Throwable Whatever the repository raised while storing results.
     */
    private function persistEmbeddings(array $chunks, array $payload): void
    {
        // The texts were submitted as a 0-based list, so the response `index`
        // refers to positions. Re-index the chunks to guarantee they line up
        // even when the repository returned non-sequential keys.
        $chunks = array_values($chunks);

        // Position => chunk uid for every chunk that has no final state yet.
        $pendingUids = array_map(
            static fn (array $chunk) => $chunk['chunk_uid'],
            $chunks
        );

        try {
            $model = (string) ($payload['model'] ?? config('LLMapi.embedding.model', 'embedding-3'));
            $usage = is_array($payload['usage'] ?? null) ? $payload['usage'] : [];
            $vectors = $this->indexEmbeddings($payload);

            foreach ($chunks as $position => $chunk) {
                if (!isset($vectors[$position])) {
                    $this->chunks->markFailed(
                        [$chunk['chunk_uid']],
                        'Embedding response missing index ' . $position,
                        true
                    );
                } else {
                    $embedding = $vectors[$position];
                    $this->chunks->markEmbedded($chunk['chunk_uid'], [
                        'provider' => 'bigmodel',
                        'model' => $model,
                        'dimensions' => count($embedding),
                        'embedding' => $embedding,
                        'usage' => $usage,
                        'embedded_at' => date(DATE_ATOM),
                    ], $model);
                }

                // Settled (stored or individually failed): exclude it from any
                // later bulk failure marking.
                unset($pendingUids[$position]);
            }
        } catch (Throwable $exception) {
            $this->chunks->markFailed(
                array_values($pendingUids),
                $exception->getMessage(),
                $this->isRetryable($exception)
            );

            throw $exception;
        }
    }

    /**
     * Extracts the embedding vectors from a response, keyed by their `index`.
     *
     * Items that are not arrays, or that lack an `index` or an array `embedding`,
     * are ignored. A `data` entry that is not an array yields an empty result.
     *
     * @param array $payload Decoded embedding response.
     *
     * @return array Map of integer position => embedding vector.
     */
    private function indexEmbeddings(array $payload): array
    {
        $items = $payload['data'] ?? [];
        if (!is_array($items)) {
            return [];
        }

        $vectors = [];
        foreach ($items as $item) {
            if (!is_array($item) || !isset($item['index'], $item['embedding']) || !is_array($item['embedding'])) {
                continue;
            }

            $vectors[(int) $item['index']] = $item['embedding'];
        }

        return $vectors;
    }

    /**
     * Decides whether a failed chunk should be flagged as retryable.
     *
     * Anything that is not an LLMRequestException, or that carries no status
     * code, is considered retryable. With a status code, only 429 and
     * 5xx-or-higher codes are retryable.
     */
    private function isRetryable(Throwable $exception): bool
    {
        if (!$exception instanceof LLMRequestException) {
            return true;
        }

        $statusCode = $exception->statusCode();
        if ($statusCode === null) {
            return true;
        }

        return $statusCode === 429 || $statusCode >= 500;
    }
}