351 lines
12 KiB
PHP
351 lines
12 KiB
PHP
<?php
|
|
|
|
namespace app\service\Search;
|
|
|
|
use app\service\Embedding\BigModelEmbeddingClient;
|
|
use InvalidArgumentException;
|
|
use OpenSearch\Client;
|
|
|
|
/**
 * Chunk search backed by OpenSearch.
 *
 * Offers three modes over the chunk index: full-text (`multi_match`), vector
 * (`knn` on the `embedding` field) and a hybrid mode that runs both and fuses
 * the two rankings with Reciprocal Rank Fusion (RRF).
 *
 * Every collaborator is optional. When one is not injected, a default instance
 * is created on first use and then reused for the lifetime of this object.
 */
class OpenSearchSearchService
{
    private const DEFAULT_LIMIT = 10;
    private const MAX_LIMIT = 50;
    private const DEFAULT_RRF_K = 60;

    /**
     * Request filter key => indexed field queried with an exact `term` clause.
     * Iteration order defines the order of the generated clauses.
     */
    private const TERM_FILTER_FIELDS = [
        'archive_uid' => 'archive_uid',
        'chunk_uid' => 'chunk_uid',
        'source' => 'source.keyword',
        'author' => 'author.keyword',
        'series' => 'series.keyword',
    ];

    // Lazily resolved collaborators. The promoted constructor properties are
    // readonly, so the per-instance cache needs its own storage.
    private ?Client $resolvedClient = null;
    private ?BigModelEmbeddingClient $resolvedEmbeddingClient = null;
    private ?SearchKeywordService $resolvedKeywordService = null;

    public function __construct(
        private readonly ?Client $client = null,
        private readonly ?BigModelEmbeddingClient $embeddingClient = null,
        private readonly ?SearchKeywordService $keywordService = null,
    ) {
    }

    /**
     * Runs a `multi_match` (best_fields) query over the text and metadata fields.
     *
     * Recognised payload keys: `query` (required), `limit`, `filters`.
     *
     * @param array<string, mixed> $payload
     *
     * @return array<string, mixed> mode, query, limit, filters, total and hits
     *
     * @throws InvalidArgumentException when `query` is missing or blank
     */
    public function fulltext(array $payload): array
    {
        $query = $this->requireQuery($payload);
        $limit = $this->limit($payload['limit'] ?? self::DEFAULT_LIMIT);
        $filters = $this->requestedFilters($payload);

        $body = [
            'size' => $limit,
            'query' => [
                'bool' => [
                    'must' => [
                        [
                            'multi_match' => [
                                'query' => $query,
                                'fields' => [
                                    'text^4',
                                    'title^3',
                                    'source^2',
                                    'author^2',
                                    'series^2',
                                    'tags^2',
                                ],
                                'type' => 'best_fields',
                            ],
                        ],
                    ],
                    'filter' => $this->filters($filters),
                ],
            ],
            '_source' => $this->sourceFields(),
        ];

        $response = $this->client()->search([
            'index' => $this->chunkIndex(),
            'body' => $body,
        ]);

        return [
            'mode' => 'fulltext',
            'query' => $query,
            'limit' => $limit,
            'filters' => $filters,
            'total' => $this->total($response),
            'hits' => $this->hits($response),
        ];
    }

    /**
     * Embeds the query and runs a `knn` search on the `embedding` field.
     *
     * Recognised payload keys: `query` (required), `limit`, `k` (defaults to
     * the effective limit), `filters`.
     *
     * @param array<string, mixed> $payload
     *
     * @return array<string, mixed> mode, query, limit, k, filters, embedding
     *                              metadata, total and hits
     *
     * @throws InvalidArgumentException when `query` is missing or blank, or the
     *                                  embedding is absent or has the wrong size
     */
    public function vector(array $payload): array
    {
        $query = $this->requireQuery($payload);
        $limit = $this->limit($payload['limit'] ?? self::DEFAULT_LIMIT);
        $k = $this->limit($payload['k'] ?? $limit);
        $filters = $this->requestedFilters($payload);
        $embedding = $this->queryEmbedding($query);

        $body = [
            'size' => $limit,
            'query' => [
                'bool' => [
                    'must' => [
                        [
                            'knn' => [
                                'embedding' => [
                                    'vector' => $embedding,
                                    'k' => $k,
                                ],
                            ],
                        ],
                    ],
                    'filter' => $this->filters($filters),
                ],
            ],
            '_source' => $this->sourceFields(),
        ];

        $response = $this->client()->search([
            'index' => $this->chunkIndex(),
            'body' => $body,
        ]);

        return [
            'mode' => 'vector',
            'query' => $query,
            'limit' => $limit,
            'k' => $k,
            'filters' => $filters,
            'embedding_model' => config('LLMapi.embedding.model', 'embedding-3'),
            'embedding_dimensions' => count($embedding),
            'total' => $this->total($response),
            'hits' => $this->hits($response),
        ];
    }

    /**
     * Runs the full-text and vector searches and fuses their rankings with RRF.
     *
     * Recognised payload keys: `query` (required), `limit`, `candidate_limit`
     * (defaults to max(limit * 3, 20), then clamped like any other limit),
     * `rrf_k` (minimum 1), `filters` and `ai`. When AI keywords are enabled the
     * keyword service may rewrite the full-text query; the vector search always
     * uses the original query.
     *
     * @param array<string, mixed> $payload
     *
     * @return array<string, mixed> fused hits (cut to `limit`) plus the
     *                              effective parameters and per-source counts;
     *                              `total` is the size of the fused list before
     *                              it is cut
     *
     * @throws InvalidArgumentException when `query` is missing or blank
     */
    public function hybrid(array $payload): array
    {
        $query = $this->requireQuery($payload);
        $limit = $this->limit($payload['limit'] ?? self::DEFAULT_LIMIT);
        $candidateLimit = $this->limit($payload['candidate_limit'] ?? max($limit * 3, 20));
        $rrfK = max(1, (int) ($payload['rrf_k'] ?? self::DEFAULT_RRF_K));
        $filters = $this->requestedFilters($payload);

        $aiKeywords = null;
        $fulltextQuery = $query;
        if ($this->aiEnabled($payload)) {
            $aiKeywords = $this->keywordService()->generate($query);
            $generated = trim((string) ($aiKeywords['query'] ?? ''));
            // Explicit emptiness check: a truthiness test would also discard "0".
            if ($generated !== '') {
                $fulltextQuery = $generated;
            }
        }

        $vectorPayload = [
            'query' => $query,
            'limit' => $candidateLimit,
            'k' => $candidateLimit,
            'filters' => $filters,
        ];
        $fulltextPayload = array_replace($vectorPayload, ['query' => $fulltextQuery]);

        $fulltext = $this->fulltext($fulltextPayload);
        $vector = $this->vector($vectorPayload);
        $hits = $this->rrf($fulltext['hits'], $vector['hits'], $rrfK);

        return [
            'mode' => 'hybrid',
            'query' => $query,
            'limit' => $limit,
            'candidate_limit' => $candidateLimit,
            'rrf_k' => $rrfK,
            'filters' => $filters,
            'ai' => $aiKeywords !== null,
            'fulltext_query' => $fulltextQuery,
            'vector_query' => $query,
            'keywords' => $aiKeywords,
            'total' => count($hits),
            'hits' => array_slice($hits, 0, $limit),
            'sources' => [
                'fulltext_total' => $fulltext['total'],
                'vector_total' => $vector['total'],
                'fulltext_hits' => count($fulltext['hits']),
                'vector_hits' => count($vector['hits']),
            ],
        ];
    }

    /**
     * Extracts the trimmed search query from the payload.
     *
     * A non-scalar `query` (e.g. an array) is treated as missing rather than
     * being cast to the literal string "Array".
     *
     * @param array<string, mixed> $payload
     *
     * @throws InvalidArgumentException when the query is missing or blank
     */
    private function requireQuery(array $payload): string
    {
        $raw = $payload['query'] ?? '';
        $query = (is_scalar($raw) || $raw instanceof \Stringable) ? trim((string) $raw) : '';
        if ($query === '') {
            throw new InvalidArgumentException('query is required.');
        }

        return $query;
    }

    /**
     * Returns the raw `filters` entry of the payload, or an empty array when it
     * is absent or not an array.
     *
     * @param array<string, mixed> $payload
     *
     * @return array<string, mixed>
     */
    private function requestedFilters(array $payload): array
    {
        $filters = $payload['filters'] ?? null;

        return is_array($filters) ? $filters : [];
    }

    /**
     * Name of the index holding the chunks (config `opensearch.indices.chunks`).
     */
    private function chunkIndex(): string
    {
        return config('opensearch.indices.chunks', 'proofdb_chunks');
    }

    /**
     * Whether the request asked for AI keywords and the feature is enabled in
     * config.
     *
     * The request flag is parsed with FILTER_VALIDATE_BOOLEAN, so string
     * payloads such as "false", "0", "off" or "no" disable the feature instead
     * of being cast to true.
     *
     * @param array<string, mixed> $payload
     */
    private function aiEnabled(array $payload): bool
    {
        $requested = filter_var($payload['ai'] ?? false, FILTER_VALIDATE_BOOLEAN);

        return $requested && (bool) config('LLMapi.search_keywords.enabled', true);
    }

    /**
     * Translates request filters into OpenSearch `bool.filter` clauses.
     *
     * Supported keys: `archive_uid`, `chunk_uid`, `source`, `author`, `series`
     * (exact `term` match), `year` (numeric `term`) and `tags` (`terms`, one
     * value or a list). Unknown keys, empty strings and non-scalar values are
     * ignored; "0" is a valid value and is kept.
     *
     * @param array<string, mixed> $filters
     *
     * @return list<array<string, mixed>>
     */
    private function filters(array $filters): array
    {
        $clauses = [];

        foreach (self::TERM_FILTER_FIELDS as $key => $field) {
            $value = $this->filterValue($filters[$key] ?? null);
            if ($value !== null) {
                $clauses[] = ['term' => [$field => $value]];
            }
        }

        if (isset($filters['year']) && is_numeric($filters['year'])) {
            $clauses[] = ['term' => ['year' => (int) $filters['year']]];
        }

        $rawTags = $filters['tags'] ?? [];
        $tags = [];
        foreach (is_array($rawTags) ? $rawTags : [$rawTags] as $rawTag) {
            $tag = $this->filterValue($rawTag);
            if ($tag !== null) {
                $tags[] = $tag;
            }
        }
        if ($tags !== []) {
            $clauses[] = ['terms' => ['tags' => $tags]];
        }

        return $clauses;
    }

    /**
     * Normalises one filter value to a non-empty string.
     *
     * Returns null for null, '', false and anything that cannot be represented
     * as a string (arrays, plain objects). Unlike empty(), "0" and 0 are kept.
     */
    private function filterValue(mixed $value): ?string
    {
        if (!is_scalar($value) && !$value instanceof \Stringable) {
            return null;
        }

        $text = (string) $value;

        return $text === '' ? null : $text;
    }

    /**
     * Flattens the hits of a search response into plain result rows.
     *
     * Missing `_source` fields become null, except `tags` ([]) and `text` ('').
     *
     * @param array<string, mixed> $response
     *
     * @return list<array<string, mixed>>
     */
    private function hits(array $response): array
    {
        $rows = [];
        foreach ($response['hits']['hits'] ?? [] as $hit) {
            $source = is_array($hit['_source'] ?? null) ? $hit['_source'] : [];
            $rows[] = [
                'score' => (float) ($hit['_score'] ?? 0),
                'chunk_uid' => $source['chunk_uid'] ?? null,
                'archive_uid' => $source['archive_uid'] ?? null,
                'chunk_index' => $source['chunk_index'] ?? null,
                'page_start' => $source['page_start'] ?? null,
                'page_end' => $source['page_end'] ?? null,
                'title' => $source['title'] ?? null,
                'source' => $source['source'] ?? null,
                'author' => $source['author'] ?? null,
                'year' => $source['year'] ?? null,
                'series' => $source['series'] ?? null,
                'tags' => $source['tags'] ?? [],
                'text' => $source['text'] ?? '',
                'embedding_model' => $source['embedding_model'] ?? null,
                'embedding_dimensions' => $source['embedding_dimensions'] ?? null,
            ];
        }

        return $rows;
    }

    /**
     * Fuses two ranked hit lists with Reciprocal Rank Fusion.
     *
     * Results are ordered by `hybrid_score` descending; ties are broken by the
     * highest raw score seen for the chunk.
     *
     * @param list<array<string, mixed>> $fulltextHits
     * @param list<array<string, mixed>> $vectorHits
     *
     * @return list<array<string, mixed>>
     */
    private function rrf(array $fulltextHits, array $vectorHits, int $rrfK): array
    {
        $merged = [];
        $this->mergeRankedHits($merged, $fulltextHits, 'fulltext', $rrfK);
        $this->mergeRankedHits($merged, $vectorHits, 'vector', $rrfK);

        // usort() discards the chunk_uid keys, so the result is already a list.
        usort($merged, static function (array $a, array $b): int {
            $byFusedScore = ($b['hybrid_score'] ?? 0) <=> ($a['hybrid_score'] ?? 0);
            if ($byFusedScore !== 0) {
                return $byFusedScore;
            }

            return ($b['score'] ?? 0) <=> ($a['score'] ?? 0);
        });

        return $merged;
    }

    /**
     * Adds one ranked list to the fusion accumulator.
     *
     * Each hit contributes 1 / (rrfK + rank) to its chunk's `hybrid_score`,
     * where rank is the 1-based position in `$hits`. Hits without a `chunk_uid`
     * are skipped but still occupy their rank position.
     *
     * @param array<array-key, array<string, mixed>> $merged accumulator keyed by chunk_uid
     * @param array<array-key, array<string, mixed>> $hits
     * @param string $source label stored under `rank_sources`
     */
    private function mergeRankedHits(array &$merged, array $hits, string $source, int $rrfK): void
    {
        // Re-index so the rank comes from the position, not from the caller's keys.
        foreach (array_values($hits) as $position => $hit) {
            $chunkUid = (string) ($hit['chunk_uid'] ?? '');
            if ($chunkUid === '') {
                continue;
            }

            $rank = $position + 1;
            $contribution = 1 / ($rrfK + $rank);
            $rawScore = (float) ($hit['score'] ?? 0);

            $merged[$chunkUid] ??= array_replace($hit, [
                'score' => 0.0,
                'hybrid_score' => 0.0,
                'rank_sources' => [],
            ]);

            $merged[$chunkUid]['hybrid_score'] += $contribution;
            $merged[$chunkUid]['score'] = max((float) $merged[$chunkUid]['score'], $rawScore);
            $merged[$chunkUid]['rank_sources'][$source] = [
                'rank' => $rank,
                'score' => $rawScore,
                'rrf' => $contribution,
            ];
        }
    }

    /**
     * Reads the total hit count from a response, accepting both the object form
     * (`hits.total.value`) and a bare number (`hits.total`).
     *
     * @param array<string, mixed> $response
     */
    private function total(array $response): int
    {
        $total = $response['hits']['total'] ?? 0;

        return (int) (is_array($total) ? ($total['value'] ?? 0) : $total);
    }

    /**
     * Clamps a requested size to the range 1..MAX_LIMIT.
     */
    private function limit(mixed $value): int
    {
        return min(self::MAX_LIMIT, max(1, (int) $value));
    }

    /**
     * Requests an embedding for the query and validates its size.
     *
     * @return list<float>
     *
     * @throws InvalidArgumentException when the response carries no embedding or
     *                                  its length differs from the configured
     *                                  `opensearch.vector.dimensions`
     */
    private function queryEmbedding(string $query): array
    {
        $payload = $this->embeddingClient()->embed([$query], [
            'model' => config('LLMapi.embedding.model', 'embedding-3'),
            'dimensions' => config('LLMapi.embedding.dimensions', 2048),
        ]);

        $embedding = $payload['data'][0]['embedding'] ?? null;
        if (!is_array($embedding)) {
            throw new InvalidArgumentException('query embedding could not be generated.');
        }

        $dimensions = (int) config('opensearch.vector.dimensions', 2048);
        if (count($embedding) !== $dimensions) {
            throw new InvalidArgumentException("query embedding dimensions must be {$dimensions}.");
        }

        return array_map(floatval(...), $embedding);
    }

    /**
     * `_source` fields requested from OpenSearch for every hit.
     *
     * @return list<string>
     */
    private function sourceFields(): array
    {
        return [
            'chunk_uid',
            'archive_uid',
            'chunk_index',
            'page_start',
            'page_end',
            'title',
            'source',
            'author',
            'year',
            'series',
            'tags',
            'text',
            'embedding_model',
            'embedding_dimensions',
        ];
    }

    /**
     * Injected OpenSearch client, or a factory-built default created once and
     * then reused.
     */
    private function client(): Client
    {
        return $this->resolvedClient ??= $this->client ?? (new OpenSearchClientFactory())->make();
    }

    /**
     * Injected embedding client, or a default instance created once and reused.
     */
    private function embeddingClient(): BigModelEmbeddingClient
    {
        return $this->resolvedEmbeddingClient ??= $this->embeddingClient ?? new BigModelEmbeddingClient();
    }

    /**
     * Injected keyword service, or a default instance created once and reused.
     */
    private function keywordService(): SearchKeywordService
    {
        return $this->resolvedKeywordService ??= $this->keywordService ?? new SearchKeywordService();
    }
}
|