proofdb/app/service/Search/OpenSearchSearchService.php
2026-05-07 01:40:58 +08:00

351 lines
12 KiB
PHP

<?php
namespace app\service\Search;
use app\service\Embedding\BigModelEmbeddingClient;
use InvalidArgumentException;
use OpenSearch\Client;
/**
 * Search service for the chunk index stored in OpenSearch.
 *
 * Three modes are exposed:
 *  - fulltext(): a `multi_match` query over the text and metadata fields;
 *  - vector():   a `knn` query on the `embedding` field, using an embedding
 *                generated for the query text;
 *  - hybrid():   runs both of the above and fuses the two ranked lists with
 *                Reciprocal Rank Fusion (sum of 1 / (rrf_k + rank)).
 *
 * Every collaborator is optional; when one is not injected a default instance
 * is created on demand.
 */
class OpenSearchSearchService
{
    private const DEFAULT_LIMIT = 10;
    private const MAX_LIMIT = 50;
    private const DEFAULT_RRF_K = 60;

    /** Lazily created client, reused when no client was injected. */
    private ?Client $resolvedClient = null;

    public function __construct(
        private readonly ?Client $client = null,
        private readonly ?BigModelEmbeddingClient $embeddingClient = null,
        private readonly ?SearchKeywordService $keywordService = null,
    ) {
    }

    /**
     * Runs a keyword search over the chunk index.
     *
     * @param array $payload Recognised keys: `query` (required), `limit`, `filters`.
     *
     * @return array{mode: string, query: string, limit: int, filters: array, total: int, hits: list<array>}
     *
     * @throws InvalidArgumentException When `query` is missing or blank.
     */
    public function fulltext(array $payload): array
    {
        $query = $this->requireQuery($payload);
        $limit = $this->limit($payload['limit'] ?? self::DEFAULT_LIMIT);
        $filters = $this->filterInput($payload);

        $response = $this->client()->search([
            'index' => $this->chunkIndex(),
            'body' => [
                'size' => $limit,
                'query' => [
                    'bool' => [
                        'must' => [
                            [
                                'multi_match' => [
                                    'query' => $query,
                                    // Chunk text is weighted highest, then the title,
                                    // then the remaining metadata fields.
                                    'fields' => [
                                        'text^4',
                                        'title^3',
                                        'source^2',
                                        'author^2',
                                        'series^2',
                                        'tags^2',
                                    ],
                                    'type' => 'best_fields',
                                ],
                            ],
                        ],
                        'filter' => $this->filters($filters),
                    ],
                ],
                '_source' => $this->sourceFields(),
            ],
        ]);

        return [
            'mode' => 'fulltext',
            'query' => $query,
            'limit' => $limit,
            'filters' => $filters,
            'total' => $this->total($response),
            'hits' => $this->hits($response),
        ];
    }

    /**
     * Runs a nearest-neighbour search using an embedding of the query text.
     *
     * @param array $payload Recognised keys: `query` (required), `limit`,
     *                       `k` (defaults to `limit`), `filters`.
     *
     * @return array Same envelope as fulltext() plus `k`, `embedding_model`
     *               and `embedding_dimensions`.
     *
     * @throws InvalidArgumentException When `query` is missing or blank, or the
     *                                  query embedding is unusable.
     */
    public function vector(array $payload): array
    {
        $query = $this->requireQuery($payload);
        $limit = $this->limit($payload['limit'] ?? self::DEFAULT_LIMIT);
        $k = $this->limit($payload['k'] ?? $limit);
        $filters = $this->filterInput($payload);
        $embedding = $this->queryEmbedding($query);

        $response = $this->client()->search([
            'index' => $this->chunkIndex(),
            'body' => [
                'size' => $limit,
                'query' => [
                    'bool' => [
                        'must' => [
                            [
                                'knn' => [
                                    'embedding' => [
                                        'vector' => $embedding,
                                        'k' => $k,
                                    ],
                                ],
                            ],
                        ],
                        // NOTE(review): the filter sits beside the knn clause rather
                        // than inside it, so restrictive filters may leave fewer than
                        // `k` hits - confirm this is acceptable for the index engine.
                        'filter' => $this->filters($filters),
                    ],
                ],
                '_source' => $this->sourceFields(),
            ],
        ]);

        return [
            'mode' => 'vector',
            'query' => $query,
            'limit' => $limit,
            'k' => $k,
            'filters' => $filters,
            'embedding_model' => $this->embeddingModel(),
            'embedding_dimensions' => count($embedding),
            'total' => $this->total($response),
            'hits' => $this->hits($response),
        ];
    }

    /**
     * Runs fulltext() and vector() and fuses both rankings with RRF.
     *
     * When the `ai` flag is set (and enabled in configuration) the keyword
     * service may rewrite the query used for the fulltext leg; the vector leg
     * always uses the original query.
     *
     * @param array $payload Recognised keys: `query` (required), `limit`,
     *                       `candidate_limit`, `rrf_k`, `filters`, `ai`.
     *
     * @return array Envelope with the fused `hits` (at most `limit`), `total`
     *               (number of distinct fused candidates, not an index-wide
     *               count) and per-leg statistics under `sources`.
     *
     * @throws InvalidArgumentException When `query` is missing or blank, or the
     *                                  query embedding is unusable.
     */
    public function hybrid(array $payload): array
    {
        $query = $this->requireQuery($payload);
        $limit = $this->limit($payload['limit'] ?? self::DEFAULT_LIMIT);
        // Each leg must return at least `limit` candidates, otherwise the fused
        // page could never be filled even when enough documents match.
        $candidateLimit = max(
            $limit,
            $this->limit($payload['candidate_limit'] ?? max($limit * 3, 20))
        );
        $rrfK = max(1, (int) ($payload['rrf_k'] ?? self::DEFAULT_RRF_K));
        $filters = $this->filterInput($payload);

        $aiKeywords = null;
        $fulltextQuery = $query;
        if ($this->aiEnabled($payload)) {
            $aiKeywords = $this->keywordService()->generate($query);
            $rewritten = trim((string) ($aiKeywords['query'] ?? ''));
            // Explicit emptiness check: a truthiness test would discard "0".
            if ($rewritten !== '') {
                $fulltextQuery = $rewritten;
            }
        }

        $basePayload = [
            'query' => $query,
            'limit' => $candidateLimit,
            'k' => $candidateLimit,
            'filters' => $filters,
        ];
        $fulltext = $this->fulltext(['query' => $fulltextQuery] + $basePayload);
        $vector = $this->vector($basePayload);
        $hits = $this->rrf($fulltext['hits'], $vector['hits'], $rrfK);

        return [
            'mode' => 'hybrid',
            'query' => $query,
            'limit' => $limit,
            'candidate_limit' => $candidateLimit,
            'rrf_k' => $rrfK,
            'filters' => $filters,
            'ai' => $aiKeywords !== null,
            'fulltext_query' => $fulltextQuery,
            'vector_query' => $query,
            'keywords' => $aiKeywords,
            'total' => count($hits),
            'hits' => array_slice($hits, 0, $limit),
            'sources' => [
                'fulltext_total' => $fulltext['total'],
                'vector_total' => $vector['total'],
                'fulltext_hits' => count($fulltext['hits']),
                'vector_hits' => count($vector['hits']),
            ],
        ];
    }

    /**
     * Extracts the trimmed, non-empty query string from a request payload.
     *
     * Non-scalar values (arrays, plain objects) are treated as missing instead
     * of being cast to a meaningless string such as "Array".
     *
     * @throws InvalidArgumentException When the query is missing or blank.
     */
    private function requireQuery(array $payload): string
    {
        $raw = $payload['query'] ?? '';
        $query = (is_scalar($raw) || $raw instanceof \Stringable) ? trim((string) $raw) : '';
        if ($query === '') {
            throw new InvalidArgumentException('query is required.');
        }

        return $query;
    }

    /** Returns the caller-supplied filter map, or an empty array when absent or malformed. */
    private function filterInput(array $payload): array
    {
        $filters = $payload['filters'] ?? null;

        return is_array($filters) ? $filters : [];
    }

    /** AI keyword expansion needs both the request flag and the configuration switch. */
    private function aiEnabled(array $payload): bool
    {
        return (bool) ($payload['ai'] ?? false) && (bool) config('LLMapi.search_keywords.enabled', true);
    }

    /**
     * Translates the filter map into OpenSearch `term` / `terms` clauses.
     *
     * Identifier fields are matched directly, `source` / `author` / `series`
     * through their `.keyword` sub-field, `year` as an integer and `tags` as a
     * `terms` clause. Missing, empty-string and non-scalar values are ignored;
     * values such as "0" are kept.
     *
     * @return list<array>
     */
    private function filters(array $filters): array
    {
        $clauses = [];

        foreach (['archive_uid', 'chunk_uid'] as $field) {
            $value = $this->filterValue($filters[$field] ?? null);
            if ($value !== null) {
                $clauses[] = ['term' => [$field => $value]];
            }
        }

        foreach (['source', 'author', 'series'] as $field) {
            $value = $this->filterValue($filters[$field] ?? null);
            if ($value !== null) {
                $clauses[] = ['term' => [$field . '.keyword' => $value]];
            }
        }

        if (isset($filters['year']) && is_numeric($filters['year'])) {
            $clauses[] = ['term' => ['year' => (int) $filters['year']]];
        }

        $rawTags = $filters['tags'] ?? [];
        $tags = [];
        foreach (is_array($rawTags) ? $rawTags : [$rawTags] as $tag) {
            $value = $this->filterValue($tag);
            if ($value !== null) {
                $tags[] = $value;
            }
        }
        if ($tags !== []) {
            $clauses[] = ['terms' => ['tags' => $tags]];
        }

        return $clauses;
    }

    /**
     * Normalises one filter value to a non-empty string, or null when it
     * should be ignored (null, non-scalar, or an empty string after casting).
     */
    private function filterValue(mixed $value): ?string
    {
        if (!is_scalar($value)) {
            return null;
        }
        $string = (string) $value;

        return $string === '' ? null : $string;
    }

    /**
     * Flattens the raw OpenSearch hits into the public hit shape.
     *
     * Absent source fields default to null (`tags` to [], `text` to '').
     *
     * @return list<array>
     */
    private function hits(array $response): array
    {
        $hits = [];
        foreach ($response['hits']['hits'] ?? [] as $hit) {
            $source = is_array($hit['_source'] ?? null) ? $hit['_source'] : [];
            $hits[] = [
                'score' => (float) ($hit['_score'] ?? 0),
                'chunk_uid' => $source['chunk_uid'] ?? null,
                'archive_uid' => $source['archive_uid'] ?? null,
                'chunk_index' => $source['chunk_index'] ?? null,
                'page_start' => $source['page_start'] ?? null,
                'page_end' => $source['page_end'] ?? null,
                'title' => $source['title'] ?? null,
                'source' => $source['source'] ?? null,
                'author' => $source['author'] ?? null,
                'year' => $source['year'] ?? null,
                'series' => $source['series'] ?? null,
                'tags' => $source['tags'] ?? [],
                'text' => $source['text'] ?? '',
                'embedding_model' => $source['embedding_model'] ?? null,
                'embedding_dimensions' => $source['embedding_dimensions'] ?? null,
            ];
        }

        return $hits;
    }

    /**
     * Fuses the two ranked hit lists and orders them by fused score.
     *
     * Ties on `hybrid_score` are broken by the highest raw score seen.
     *
     * @return list<array> Hits extended with `hybrid_score` and `rank_sources`.
     */
    private function rrf(array $fulltextHits, array $vectorHits, int $rrfK): array
    {
        $merged = $this->mergeRankedHits([], $fulltextHits, 'fulltext', $rrfK);
        $merged = $this->mergeRankedHits($merged, $vectorHits, 'vector', $rrfK);

        // usort() re-indexes the array, so the result is already a list.
        usort($merged, static function (array $a, array $b): int {
            $byHybrid = ($b['hybrid_score'] ?? 0) <=> ($a['hybrid_score'] ?? 0);

            return $byHybrid !== 0 ? $byHybrid : ($b['score'] ?? 0) <=> ($a['score'] ?? 0);
        });

        return $merged;
    }

    /**
     * Adds one ranked list's contributions to the accumulator keyed by chunk_uid.
     *
     * The rank is the 1-based position in `$hits`. Hits without a chunk_uid are
     * skipped, and a chunk_uid repeated within the same list only counts at its
     * first (best) position so one ranker cannot contribute twice.
     *
     * @return array<array-key, array> The updated accumulator.
     */
    private function mergeRankedHits(array $merged, array $hits, string $source, int $rrfK): array
    {
        foreach (array_values($hits) as $index => $hit) {
            $chunkUid = (string) ($hit['chunk_uid'] ?? '');
            if ($chunkUid === '' || isset($merged[$chunkUid]['rank_sources'][$source])) {
                continue;
            }

            $rank = $index + 1;
            $contribution = 1 / ($rrfK + $rank);
            $rawScore = (float) ($hit['score'] ?? 0);

            if (!isset($merged[$chunkUid])) {
                $merged[$chunkUid] = $hit;
                $merged[$chunkUid]['score'] = 0.0;
                $merged[$chunkUid]['hybrid_score'] = 0.0;
                $merged[$chunkUid]['rank_sources'] = [];
            }

            $merged[$chunkUid]['hybrid_score'] += $contribution;
            $merged[$chunkUid]['score'] = max((float) $merged[$chunkUid]['score'], $rawScore);
            $merged[$chunkUid]['rank_sources'][$source] = [
                'rank' => $rank,
                'score' => $rawScore,
                'rrf' => $contribution,
            ];
        }

        return $merged;
    }

    /** Reads the total hit count, accepting both the `{value: n}` object and a bare number. */
    private function total(array $response): int
    {
        $total = $response['hits']['total'] ?? 0;
        if (is_array($total)) {
            return (int) ($total['value'] ?? 0);
        }

        return (int) $total;
    }

    /** Clamps a requested size to the range 1..MAX_LIMIT. */
    private function limit(mixed $value): int
    {
        return min(self::MAX_LIMIT, max(1, (int) $value));
    }

    /**
     * Generates the embedding vector for the query text.
     *
     * @return array<float>
     *
     * @throws InvalidArgumentException When no embedding is returned or its
     *                                  length differs from the configured
     *                                  index vector dimensions.
     */
    private function queryEmbedding(string $query): array
    {
        $payload = $this->embeddingClient()->embed([$query], [
            'model' => $this->embeddingModel(),
            'dimensions' => config('LLMapi.embedding.dimensions', 2048),
        ]);

        $embedding = $payload['data'][0]['embedding'] ?? null;
        if (!is_array($embedding)) {
            throw new InvalidArgumentException('query embedding could not be generated.');
        }

        // Validated against the index-side setting, which is configured
        // separately from the dimensions requested from the embedding API.
        $dimensions = (int) config('opensearch.vector.dimensions', 2048);
        if (count($embedding) !== $dimensions) {
            throw new InvalidArgumentException("query embedding dimensions must be {$dimensions}.");
        }

        return array_map(floatval(...), $embedding);
    }

    /**
     * Fields requested from `_source` for every hit.
     *
     * @return list<string>
     */
    private function sourceFields(): array
    {
        return [
            'chunk_uid',
            'archive_uid',
            'chunk_index',
            'page_start',
            'page_end',
            'title',
            'source',
            'author',
            'year',
            'series',
            'tags',
            'text',
            'embedding_model',
            'embedding_dimensions',
        ];
    }

    /** Name of the chunk index, taken from configuration. */
    private function chunkIndex(): string
    {
        return config('opensearch.indices.chunks', 'proofdb_chunks');
    }

    /** Embedding model name, taken from configuration. */
    private function embeddingModel(): string
    {
        return config('LLMapi.embedding.model', 'embedding-3');
    }

    /**
     * Returns the injected client, or a factory-built one that is created once
     * and reused (hybrid() alone issues two searches per call).
     */
    private function client(): Client
    {
        return $this->resolvedClient ??= $this->client ?? (new OpenSearchClientFactory())->make();
    }

    private function embeddingClient(): BigModelEmbeddingClient
    {
        return $this->embeddingClient ?? new BigModelEmbeddingClient();
    }

    private function keywordService(): SearchKeywordService
    {
        return $this->keywordService ?? new SearchKeywordService();
    }
}