proofdb/app/service/Search/OpenSearchChunkIndex.php
2026-05-07 01:40:58 +08:00

114 lines
3.3 KiB
PHP

<?php
namespace app\service\Search;
use OpenSearch\Client;
/**
 * Manages the OpenSearch index that stores chunk documents: creates the index
 * with its mapping and bulk-writes documents into it.
 *
 * The index name, the vector field settings and the bulk refresh policy are
 * read through the `config()` helper under `opensearch.*` keys.
 */
class OpenSearchChunkIndex
{
    /**
     * Client actually used for requests. Resolved on first use and reused
     * afterwards so a factory-built client is not rebuilt on every call.
     */
    private ?Client $resolvedClient = null;

    /**
     * @param Client|null $client Client to use. When null, one is built by
     *                            OpenSearchClientFactory the first time it is needed.
     */
    public function __construct(private readonly ?Client $client = null)
    {
    }

    /**
     * Creates the chunk index with the mapping from mapping() unless an index
     * with the configured name already exists.
     *
     * The existence check and the create call are two separate requests, so
     * this is not atomic: two concurrent callers can both reach the create call.
     */
    public function ensureExists(): void
    {
        $client = $this->client();
        $index = $this->indexName();

        if ($client->indices()->exists(['index' => $index])) {
            return;
        }

        $client->indices()->create([
            'index' => $index,
            'body' => $this->mapping(),
        ]);
    }

    /**
     * Sends the given documents to the chunk index in one bulk request, using
     * each document's `chunk_uid` as its `_id`.
     *
     * @param array $documents Documents to index; each one must carry a
     *                         non-empty `chunk_uid`.
     *
     * @return array The client's bulk response. For empty input no request is
     *               made and `['items' => [], 'errors' => false]` is returned.
     *
     * @throws \InvalidArgumentException When a document has no usable `chunk_uid`.
     */
    public function bulkIndex(array $documents): array
    {
        if ($documents === []) {
            return ['items' => [], 'errors' => false];
        }

        // Validate the whole batch before touching configuration or the network,
        // so a malformed document fails fast with a message naming its position
        // instead of producing an action line with a null `_id`.
        $ids = [];
        foreach ($documents as $position => $document) {
            $ids[$position] = $this->chunkUid($document, $position);
        }

        // Loop-invariant: resolve the index name once rather than per document.
        $index = $this->indexName();

        // The bulk body alternates an action line with the document it applies to.
        $body = [];
        foreach ($documents as $position => $document) {
            $body[] = [
                'index' => [
                    '_index' => $index,
                    '_id' => $ids[$position],
                ],
            ];
            $body[] = $document;
        }

        return $this->client()->bulk([
            'refresh' => config('opensearch.bulk.refresh', 'false'),
            'body' => $body,
        ]);
    }

    /**
     * Builds the index creation body: index settings plus field mappings.
     *
     * The `embedding` field's dimension, space type and engine come from
     * `opensearch.vector.*` configuration, with the defaults shown below.
     *
     * @return array{settings: array, mappings: array}
     */
    public function mapping(): array
    {
        return [
            'settings' => [
                'index' => [
                    // Index-level k-NN switch, set alongside the knn_vector field below.
                    'knn' => true,
                ],
            ],
            'mappings' => [
                'properties' => [
                    'chunk_uid' => ['type' => 'keyword'],
                    'archive_uid' => ['type' => 'keyword'],
                    'chunk_index' => ['type' => 'integer'],
                    'page_start' => ['type' => 'integer'],
                    'page_end' => ['type' => 'integer'],
                    'title' => $this->textWithKeyword(),
                    'source' => $this->textWithKeyword(),
                    'author' => $this->textWithKeyword(),
                    'year' => ['type' => 'integer'],
                    'series' => $this->textWithKeyword(),
                    'tags' => ['type' => 'keyword'],
                    'text' => ['type' => 'text'],
                    'embedding' => [
                        'type' => 'knn_vector',
                        'dimension' => (int) config('opensearch.vector.dimensions', 2048),
                        'method' => [
                            'name' => 'hnsw',
                            'space_type' => config('opensearch.vector.space_type', 'cosinesimil'),
                            'engine' => config('opensearch.vector.engine', 'lucene'),
                        ],
                    ],
                    'embedding_model' => ['type' => 'keyword'],
                    'embedding_dimensions' => ['type' => 'integer'],
                    'created_time' => ['type' => 'date'],
                    'updated_time' => ['type' => 'date'],
                ],
            ],
        ];
    }

    /**
     * Returns the client for this instance: the injected one if given,
     * otherwise one built by OpenSearchClientFactory on first use and cached.
     */
    private function client(): Client
    {
        return $this->resolvedClient ??= $this->client ?? (new OpenSearchClientFactory())->make();
    }

    /**
     * Name of the chunk index, from `opensearch.indices.chunks`
     * (default `proofdb_chunks`).
     */
    private function indexName(): string
    {
        return config('opensearch.indices.chunks', 'proofdb_chunks');
    }

    /**
     * Extracts the `chunk_uid` used as the document `_id`.
     *
     * @param mixed      $document One entry of the batch passed to bulkIndex().
     * @param int|string $position Key of that entry in the batch, used in the error message.
     *
     * @throws \InvalidArgumentException When `chunk_uid` is missing, null or an empty string.
     */
    private function chunkUid(mixed $document, int|string $position): mixed
    {
        $uid = $document['chunk_uid'] ?? null;

        if ($uid === null || $uid === '') {
            throw new \InvalidArgumentException(
                sprintf('Document at position "%s" has no chunk_uid to use as its _id.', (string) $position)
            );
        }

        return $uid;
    }

    /**
     * Mapping fragment for a `text` field that also exposes a `keyword`
     * sub-field, with `ignore_above` set to 512 on that sub-field.
     *
     * @return array{type: string, fields: array}
     */
    private function textWithKeyword(): array
    {
        return [
            'type' => 'text',
            'fields' => [
                'keyword' => [
                    'type' => 'keyword',
                    'ignore_above' => 512,
                ],
            ],
        ];
    }
}