limit($payload['limit'] ?? self::DEFAULT_LIMIT);
        $filters = is_array($payload['filters'] ?? null) ? $payload['filters'] : [];

        $body = [
            'size' => $limit,
            'query' => [
                'bool' => [
                    'must' => [
                        [
                            'multi_match' => [
                                'query' => $query,
                                'fields' => [
                                    'text^4',
                                    'title^3',
                                    'summary^2',
                                    'source^2',
                                    'author^2',
                                    'series^2',
                                    'tags^2',
                                ],
                                'type' => 'best_fields',
                            ],
                        ],
                    ],
                    'filter' => $this->filters($filters),
                ],
            ],
            '_source' => $this->sourceFields(),
        ];

        $response = $this->client()->search([
            'index' => config('opensearch.indices.chunks', 'proofdb_chunks'),
            'body' => $body,
        ]);

        return [
            'mode' => 'fulltext',
            'query' => $query,
            'limit' => $limit,
            'filters' => $filters,
            'total' => $this->total($response),
            'hits' => $this->hits($response),
        ];
    }

    /**
     * Run a k-NN search on the `embedding` field using an embedding of the query text.
     *
     * Payload keys read: `query` (required, trimmed), `limit` (defaults to
     * self::DEFAULT_LIMIT), `k` (defaults to the clamped limit) and `filters`
     * (ignored unless it is an array). Both `limit` and `k` are clamped by limit().
     *
     * @param array<string, mixed> $payload
     *
     * @return array<string, mixed> Envelope with mode, query, limit, k, filters,
     *                              embedding_model, embedding_dimensions, total and hits.
     *
     * @throws InvalidArgumentException When the query is empty or no usable embedding is produced.
     */
    public function vector(array $payload): array
    {
        $query = trim((string) ($payload['query'] ?? ''));
        if ($query === '') {
            throw new InvalidArgumentException('query is required.');
        }

        $limit = $this->limit($payload['limit'] ?? self::DEFAULT_LIMIT);
        $k = $this->limit($payload['k'] ?? $limit);
        $filters = is_array($payload['filters'] ?? null) ? $payload['filters'] : [];
        $embedding = $this->queryEmbedding($query);

        // NOTE(review): the filter sits beside the knn clause inside the bool query
        // rather than inside the knn clause itself — confirm this is the intended
        // filtering mode for the configured k-NN engine.
        $response = $this->client()->search([
            'index' => config('opensearch.indices.chunks', 'proofdb_chunks'),
            'body' => [
                'size' => $limit,
                'query' => [
                    'bool' => [
                        'must' => [
                            [
                                'knn' => [
                                    'embedding' => [
                                        'vector' => $embedding,
                                        'k' => $k,
                                    ],
                                ],
                            ],
                        ],
                        'filter' => $this->filters($filters),
                    ],
                ],
                '_source' => $this->sourceFields(),
            ],
        ]);

        return [
            'mode' => 'vector',
            'query' => $query,
            'limit' => $limit,
            'k' => $k,
            'filters' => $filters,
            'embedding_model' => config('LLMapi.embedding.model', 'embedding-3'),
            'embedding_dimensions' => count($embedding),
            'total' => $this->total($response),
            'hits' => $this->hits($response),
        ];
    }

    /**
     * Combine fulltext and vector results with reciprocal rank fusion.
     *
     * Payload keys read: `query` (required, trimmed), `limit`, `candidate_limit`
     * (defaults to max(limit * 3, 20), then clamped by limit()), `rrf_k` (floored at 1,
     * defaults to self::DEFAULT_RRF_K), `filters` and `ai`.
     *
     * When aiEnabled() is true, the keyword service's `query` entry replaces the
     * fulltext query if it is non-empty after trimming; the vector search always
     * uses the original query.
     *
     * @param array<string, mixed> $payload
     *
     * @return array<string, mixed> Envelope whose `total` is the number of merged hits
     *                              before slicing and whose `hits` holds at most `limit` entries.
     *
     * @throws InvalidArgumentException When the query is empty.
     */
    public function hybrid(array $payload): array
    {
        $query = trim((string) ($payload['query'] ?? ''));
        if ($query === '') {
            throw new InvalidArgumentException('query is required.');
        }

        $limit = $this->limit($payload['limit'] ?? self::DEFAULT_LIMIT);
        $candidateLimit = $this->limit($payload['candidate_limit'] ?? max($limit * 3, 20));
        $rrfK = max(1, (int) ($payload['rrf_k'] ?? self::DEFAULT_RRF_K));
        $filters = is_array($payload['filters'] ?? null) ? $payload['filters'] : [];

        $aiKeywords = null;
        $fulltextQuery = $query;
        if ($this->aiEnabled($payload)) {
            $aiKeywords = $this->keywordService()->generate($query);
            // Fall back to the user's query when no rewritten query is returned.
            $fulltextQuery = trim((string) ($aiKeywords['query'] ?? '')) ?: $query;
        }

        // Both searches fetch the wider candidate set; the final limit is applied after fusion.
        $basePayload = [
            'query' => $query,
            'limit' => $candidateLimit,
            'k' => $candidateLimit,
            'filters' => $filters,
        ];
        $fulltextPayload = $basePayload;
        $fulltextPayload['query'] = $fulltextQuery;

        $fulltext = $this->fulltext($fulltextPayload);
        $vector = $this->vector($basePayload);
        $hits = $this->rrf($fulltext['hits'], $vector['hits'], $rrfK);

        return [
            'mode' => 'hybrid',
            'query' => $query,
            'limit' => $limit,
            'candidate_limit' => $candidateLimit,
            'rrf_k' => $rrfK,
            'filters' => $filters,
            'ai' => $aiKeywords !== null,
            'fulltext_query' => $fulltextQuery,
            'vector_query' => $query,
            'keywords' => $aiKeywords,
            'total' => count($hits),
            'hits' => array_slice($hits, 0, $limit),
            'sources' => [
                'fulltext_total' => $fulltext['total'],
                'vector_total' => $vector['total'],
                'fulltext_hits' => count($fulltext['hits']),
                'vector_hits' => count($vector['hits']),
            ],
        ];
    }

    /**
     * Whether keyword generation should run for this request.
     *
     * True only when the payload's `ai` flag is truthy and the
     * `LLMapi.search_keywords.enabled` config value (default true) is truthy.
     *
     * @param array<string, mixed> $payload
     */
    private function aiEnabled(array $payload): bool
    {
        return (bool) ($payload['ai'] ?? false)
            && (bool) config('LLMapi.search_keywords.enabled', true);
    }

    /**
     * Translate the request's filter map into bool-query filter clauses.
     *
     * Falsy values (including the string '0') mean "no filter" for the string
     * fields and tags. Non-scalar values are ignored: casting an array to string
     * would raise an "Array to string conversion" warning and search for the
     * literal term "Array".
     *
     * @param array<string, mixed> $filters
     *
     * @return list<array<string, mixed>> `term` / `terms` clauses, possibly empty.
     */
    private function filters(array $filters): array
    {
        $clauses = [];

        // These two are matched on the field name itself.
        foreach (['archive_uid', 'chunk_uid'] as $field) {
            $value = $filters[$field] ?? null;
            if (!empty($value) && is_scalar($value)) {
                $clauses[] = ['term' => [$field => (string) $value]];
            }
        }

        // These are matched on the `.keyword` sub-field
        // (presumably the exact-match variant of a text field — confirm against the index mapping).
        foreach (['source', 'author', 'series'] as $field) {
            $value = $filters[$field] ?? null;
            if (!empty($value) && is_scalar($value)) {
                $clauses[] = ['term' => [$field . '.keyword' => (string) $value]];
            }
        }

        if (isset($filters['year']) && is_numeric($filters['year'])) {
            $clauses[] = ['term' => ['year' => (int) $filters['year']]];
        }

        if (!empty($filters['tags'])) {
            // Accept a single tag or a list of tags.
            $tags = is_array($filters['tags']) ? $filters['tags'] : [$filters['tags']];
            // Drop nested arrays/objects before stringifying, then drop empty strings.
            $tags = array_filter($tags, 'is_scalar');
            $tags = array_values(array_filter(array_map('strval', $tags)));
            if ($tags !== []) {
                $clauses[] = ['terms' => ['tags' => $tags]];
            }
        }

        return $clauses;
    }

    /**
     * Flatten a search response into a list of hit arrays.
     *
     * Each entry carries the hit's `_score` as a float plus the `_source` fields
     * listed in sourceFields(); missing fields default to null (`tags` to [],
     * `text` to '').
     *
     * @param array<string, mixed> $response
     *
     * @return list<array<string, mixed>>
     */
    private function hits(array $response): array
    {
        $hits = [];

        foreach ($response['hits']['hits'] ?? [] as $hit) {
            $source = is_array($hit['_source'] ?? null) ? $hit['_source'] : [];

            $hits[] = [
                'score' => (float) ($hit['_score'] ?? 0),
                'chunk_uid' => $source['chunk_uid'] ?? null,
                'archive_uid' => $source['archive_uid'] ?? null,
                'chunk_index' => $source['chunk_index'] ?? null,
                'page_start' => $source['page_start'] ?? null,
                'page_end' => $source['page_end'] ?? null,
                'title' => $source['title'] ?? null,
                'summary' => $source['summary'] ?? null,
                'source' => $source['source'] ?? null,
                'author' => $source['author'] ?? null,
                'year' => $source['year'] ?? null,
                'series' => $source['series'] ?? null,
                'tags' => $source['tags'] ?? [],
                'text' => $source['text'] ?? '',
                'embedding_model' => $source['embedding_model'] ?? null,
                'embedding_dimensions' => $source['embedding_dimensions'] ?? null,
            ];
        }

        return $hits;
    }

    /**
     * Fuse two ranked hit lists by reciprocal rank.
     *
     * Results are ordered by `hybrid_score` descending, with the best per-source
     * `score` as the tie-breaker.
     *
     * @param list<array<string, mixed>> $fulltextHits
     * @param list<array<string, mixed>> $vectorHits
     *
     * @return list<array<string, mixed>>
     */
    private function rrf(array $fulltextHits, array $vectorHits, int $rrfK): array
    {
        $merged = [];
        $this->mergeRankedHits($merged, $fulltextHits, 'fulltext', $rrfK);
        $this->mergeRankedHits($merged, $vectorHits, 'vector', $rrfK);

        usort($merged, static function (array $a, array $b): int {
            $scoreCompare = ($b['hybrid_score'] ?? 0) <=> ($a['hybrid_score'] ?? 0);
            if ($scoreCompare !== 0) {
                return $scoreCompare;
            }

            return ($b['score'] ?? 0) <=> ($a['score'] ?? 0);
        });

        // usort() already replaces the chunk_uid keys with 0..n-1.
        return $merged;
    }

    /**
     * Add one ranked list's contributions to the merged map, keyed by chunk_uid.
     *
     * Each hit contributes 1 / ($rrfK + rank), where rank is its 1-based position
     * in $hits. Hits without a chunk_uid are skipped but still occupy their
     * position. A chunk_uid repeated within the same list counts only once, at
     * its first (best) position, so a duplicate cannot inflate the fused score.
     *
     * @param array<int|string, array<string, mixed>> $merged Accumulator, modified in place.
     * @param list<array<string, mixed>>              $hits
     * @param string                                  $source Label stored under `rank_sources`.
     */
    private function mergeRankedHits(array &$merged, array $hits, string $source, int $rrfK): void
    {
        foreach ($hits as $index => $hit) {
            $chunkUid = (string) ($hit['chunk_uid'] ?? '');
            if ($chunkUid === '') {
                continue;
            }

            // Each ranking may contribute at most once per chunk.
            if (isset($merged[$chunkUid]['rank_sources'][$source])) {
                continue;
            }

            $rank = $index + 1;
            $contribution = 1 / ($rrfK + $rank);

            if (!isset($merged[$chunkUid])) {
                // The first list to mention a chunk supplies its display fields.
                $merged[$chunkUid] = $hit;
                $merged[$chunkUid]['score'] = 0.0;
                $merged[$chunkUid]['hybrid_score'] = 0.0;
                $merged[$chunkUid]['rank_sources'] = [];
            }

            $merged[$chunkUid]['hybrid_score'] += $contribution;
            $merged[$chunkUid]['score'] = max(
                (float) ($merged[$chunkUid]['score'] ?? 0),
                (float) ($hit['score'] ?? 0),
            );
            $merged[$chunkUid]['rank_sources'][$source] = [
                'rank' => $rank,
                'score' => (float) ($hit['score'] ?? 0),
                'rrf' => $contribution,
            ];
        }
    }

    /**
     * Read the total hit count from a search response.
     *
     * Handles both shapes of `hits.total`: an array with a `value` entry, or a
     * bare number. Missing values yield 0.
     *
     * @param array<string, mixed> $response
     */
    private function total(array $response): int
    {
        $total = $response['hits']['total'] ?? 0;
        if (is_array($total)) {
            return (int) ($total['value'] ?? 0);
        }

        return (int) $total;
    }

    /**
     * Clamp a requested size to the range 1..self::MAX_LIMIT after an integer cast.
     */
    private function limit(mixed $value): int
    {
        return min(self::MAX_LIMIT, max(1, (int) $value));
    }

    /**
     * Embed the query text and validate the resulting vector.
     *
     * The dimension requested from the embedding client comes from
     * `LLMapi.embedding.dimensions`, while the returned vector is validated against
     * `opensearch.vector.dimensions`; both default to 2048.
     *
     * @return array<int|string, float> Embedding values cast to float.
     *
     * @throws InvalidArgumentException When no embedding array is returned or its
     *                                  length differs from the configured dimensions.
     */
    private function queryEmbedding(string $query): array
    {
        $payload = $this->embeddingClient()->embed([$query], [
            'model' => config('LLMapi.embedding.model', 'embedding-3'),
            'dimensions' => config('LLMapi.embedding.dimensions', 2048),
        ]);

        $embedding = $payload['data'][0]['embedding'] ?? null;
        if (!is_array($embedding)) {
            throw new InvalidArgumentException('query embedding could not be generated.');
        }

        $dimensions = (int) config('opensearch.vector.dimensions', 2048);
        if (count($embedding) !== $dimensions) {
            throw new InvalidArgumentException("query embedding dimensions must be {$dimensions}.");
        }

        return array_map('floatval', $embedding);
    }

    /**
     * The `_source` fields requested from the index; hits() reads exactly these.
     *
     * @return list<string>
     */
    private function sourceFields(): array
    {
        return [
            'chunk_uid',
            'archive_uid',
            'chunk_index',
            'page_start',
            'page_end',
            'title',
            'summary',
            'source',
            'author',
            'year',
            'series',
            'tags',
            'text',
            'embedding_model',
            'embedding_dimensions',
        ];
    }

    /**
     * Return the client held on this instance, or build one via the factory.
     *
     * The fallback is not stored here, so each call without a held client builds a new one.
     */
    private function client(): Client
    {
        return $this->client ?? (new OpenSearchClientFactory())->make();
    }

    /**
     * Return the embedding client held on this instance, or a new default one.
     */
    private function embeddingClient(): BigModelEmbeddingClient
    {
        return $this->embeddingClient ?? new BigModelEmbeddingClient();
    }

    /**
     * Return the keyword service held on this instance, or a new default one.
     */
    private function keywordService(): SearchKeywordService
    {
        return $this->keywordService ?? new SearchKeywordService();
    }
}