114 lines
3.3 KiB
PHP
114 lines
3.3 KiB
PHP
<?php
|
|
|
|
namespace app\service\Search;
|
|
|
|
use OpenSearch\Client;
|
|
|
|
/**
 * Manages the OpenSearch index that stores text chunks together with their
 * embedding vectors: creating the index on demand and bulk-indexing documents.
 *
 * The index name, bulk refresh policy and vector settings are read through the
 * config() helper under the "opensearch." prefix.
 */
class OpenSearchChunkIndex
{
    /**
     * Client obtained from OpenSearchClientFactory when none was injected.
     * Kept so the factory is invoked at most once per instance.
     */
    private ?Client $fallbackClient = null;

    /**
     * @param Client|null $client Client to use for all requests. When null, a
     *                            client is obtained from OpenSearchClientFactory
     *                            the first time one is needed.
     */
    public function __construct(private readonly ?Client $client = null)
    {
    }

    /**
     * Creates the chunk index with the schema from mapping() unless it already exists.
     *
     * The existence check and the creation are two separate requests, so another
     * process can still create the index in between; nothing is caught here, so
     * whatever the client raises in that case propagates to the caller.
     */
    public function ensureExists(): void
    {
        $client = $this->client();
        $index = $this->indexName();

        if ($client->indices()->exists(['index' => $index])) {
            return;
        }

        $client->indices()->create([
            'index' => $index,
            'body' => $this->mapping(),
        ]);
    }

    /**
     * Sends the given documents to the chunk index in a single bulk request.
     *
     * Each document is indexed under its own "chunk_uid" as the document ID.
     * The whole batch is validated before the index name is resolved or any
     * request is made, so an invalid batch never reaches the client.
     *
     * @param array<int|string, array<string, mixed>> $documents Documents to index;
     *        every entry must carry a non-empty string or integer "chunk_uid".
     *
     * @return array The client's bulk response, or
     *               ['items' => [], 'errors' => false] when $documents is empty
     *               (no request is made in that case).
     *
     * @throws \InvalidArgumentException When a document has no usable "chunk_uid".
     */
    public function bulkIndex(array $documents): array
    {
        if ($documents === []) {
            return ['items' => [], 'errors' => false];
        }

        $this->assertDocumentsHaveIds($documents);

        // Resolved once: the target index is the same for every document.
        $index = $this->indexName();

        $body = [];
        foreach ($documents as $document) {
            // Bulk payload alternates an action line with the document source.
            $body[] = [
                'index' => [
                    '_index' => $index,
                    '_id' => $document['chunk_uid'],
                ],
            ];
            $body[] = $document;
        }

        return $this->client()->bulk([
            'refresh' => config('opensearch.bulk.refresh', 'false'),
            'body' => $body,
        ]);
    }

    /**
     * Builds the index definition (settings and field mappings) for chunk documents.
     *
     * The "embedding" field is a knn_vector using the "hnsw" method. Its
     * dimension, space type and engine come from the config keys
     * "opensearch.vector.dimensions", "opensearch.vector.space_type" and
     * "opensearch.vector.engine", defaulting to 2048, "cosinesimil" and "lucene".
     *
     * @return array<string, mixed> Body suitable for the index-creation request.
     */
    public function mapping(): array
    {
        return [
            'settings' => [
                'index' => [
                    'knn' => true,
                ],
            ],
            'mappings' => [
                'properties' => [
                    'chunk_uid' => ['type' => 'keyword'],
                    'archive_uid' => ['type' => 'keyword'],
                    'chunk_index' => ['type' => 'integer'],
                    'page_start' => ['type' => 'integer'],
                    'page_end' => ['type' => 'integer'],
                    'title' => $this->textWithKeyword(),
                    'source' => $this->textWithKeyword(),
                    'author' => $this->textWithKeyword(),
                    'year' => ['type' => 'integer'],
                    'series' => $this->textWithKeyword(),
                    'tags' => ['type' => 'keyword'],
                    'text' => ['type' => 'text'],
                    'embedding' => [
                        'type' => 'knn_vector',
                        'dimension' => (int) config('opensearch.vector.dimensions', 2048),
                        'method' => [
                            'name' => 'hnsw',
                            'space_type' => config('opensearch.vector.space_type', 'cosinesimil'),
                            'engine' => config('opensearch.vector.engine', 'lucene'),
                        ],
                    ],
                    'embedding_model' => ['type' => 'keyword'],
                    'embedding_dimensions' => ['type' => 'integer'],
                    'created_time' => ['type' => 'date'],
                    'updated_time' => ['type' => 'date'],
                ],
            ],
        ];
    }

    /**
     * Returns the injected client, or a factory-made one that is created on
     * first use and reused for the lifetime of this instance.
     */
    private function client(): Client
    {
        if ($this->client !== null) {
            return $this->client;
        }

        return $this->fallbackClient ??= (new OpenSearchClientFactory())->make();
    }

    /**
     * Name of the chunk index, read from "opensearch.indices.chunks"
     * (default "proofdb_chunks").
     */
    private function indexName(): string
    {
        return config('opensearch.indices.chunks', 'proofdb_chunks');
    }

    /**
     * Mapping for a full-text field that also exposes a "keyword" sub-field
     * (with ignore_above set to 512).
     *
     * @return array<string, mixed>
     */
    private function textWithKeyword(): array
    {
        return [
            'type' => 'text',
            'fields' => [
                'keyword' => [
                    'type' => 'keyword',
                    'ignore_above' => 512,
                ],
            ],
        ];
    }

    /**
     * Fails fast when any document lacks an identifier usable as the bulk "_id".
     *
     * @param array<int|string, mixed> $documents
     *
     * @throws \InvalidArgumentException On the first document whose "chunk_uid"
     *                                   is absent, null, empty, or not a string/int.
     */
    private function assertDocumentsHaveIds(array $documents): void
    {
        foreach ($documents as $position => $document) {
            $uid = $document['chunk_uid'] ?? null;

            if (is_int($uid) || (is_string($uid) && $uid !== '')) {
                continue;
            }

            throw new \InvalidArgumentException(sprintf(
                'Document at position %s is missing a non-empty "chunk_uid".',
                $position,
            ));
        }
    }
}
|