176 lines
6.0 KiB
PHP
176 lines
6.0 KiB
PHP
<?php
|
|
|
|
namespace app\service\Search;
|
|
|
|
use app\service\Embedding\EmbeddingStatus;
|
|
use support\Db;
|
|
|
|
/**
 * Persistence helper for the search-index lifecycle columns of the `chunks` table
 * (`search_index_status`, `search_index_error`, `search_index_updated_at`).
 *
 * Every query in this class only considers chunks whose `embedding_status` is
 * EmbeddingStatus::EMBEDDED, except the mark*() status writers, which address
 * chunks directly by `chunk_uid`.
 */
class ChunkSearchIndexRepository
{
    /** Maximum number of characters of an error message stored in `search_index_error`. */
    private const MAX_ERROR_LENGTH = 4000;

    /**
     * Puts embedded chunks back into the PENDING search-index state and clears
     * their stored error and update timestamp.
     *
     * @param string|null $archiveUid Restrict the reset to one archive; null, empty or
     *                                whitespace-only means "all archives".
     *
     * @return int The value returned by the query builder's update() (affected row count).
     */
    public function resetEmbeddedChunksToPending(?string $archiveUid = null): int
    {
        $query = Db::table('chunks')
            ->where('embedding_status', EmbeddingStatus::EMBEDDED);

        // Normalise once so the emptiness check and the filter value cannot diverge.
        $archiveUid = $archiveUid === null ? '' : trim($archiveUid);
        if ($archiveUid !== '') {
            $query->where('archive_uid', $archiveUid);
        }

        return $query->update([
            'search_index_status' => SearchIndexStatus::PENDING,
            'search_index_error' => null,
            'search_index_updated_at' => null,
        ]);
    }

    /**
     * Selects up to $limit archives that still have embedded chunks waiting to be
     * indexed and marks all such chunks of those archives as QUEUED.
     *
     * Archives are ordered by the smallest `id` among their matching chunks.
     * Chunks that are already QUEUED or INDEXING are selected again as well
     * (NOTE(review): presumably so interrupted work is re-queued; confirm with the worker).
     *
     * @param int $limit Maximum number of archives to queue.
     *
     * @return list<string> The archive uids that were queued.
     */
    public function queuePendingArchiveTasks(int $limit): array
    {
        $statuses = [
            SearchIndexStatus::PENDING,
            SearchIndexStatus::QUEUED,
            SearchIndexStatus::INDEXING,
            SearchIndexStatus::FAILED_RETRYABLE,
        ];

        $archiveUids = Db::table('chunks')
            ->where('embedding_status', EmbeddingStatus::EMBEDDED)
            ->whereIn('search_index_status', $statuses)
            ->select('archive_uid')
            ->groupBy('archive_uid')
            ->orderByRaw('MIN(id)')
            ->limit($limit)
            ->pluck('archive_uid')
            ->all();

        // Drop only missing/empty uids. A bare array_filter() would also discard the
        // perfectly valid uid "0" because it is falsy.
        $archiveUids = array_values(array_filter(
            array_map('strval', $archiveUids),
            static fn (string $uid): bool => $uid !== ''
        ));

        if ($archiveUids === []) {
            return [];
        }

        // One UPDATE for all selected archives instead of one statement per archive.
        Db::table('chunks')
            ->whereIn('archive_uid', $archiveUids)
            ->where('embedding_status', EmbeddingStatus::EMBEDDED)
            ->whereIn('search_index_status', $statuses)
            ->update([
                'search_index_status' => SearchIndexStatus::QUEUED,
                'search_index_error' => null,
                'search_index_updated_at' => Db::raw('CURRENT_TIMESTAMP'),
            ]);

        return $archiveUids;
    }

    /**
     * Loads the QUEUED or INDEXING chunks of one archive, joined with the archive's
     * metadata, as search documents ordered by `chunk_index`.
     *
     * @param string $archiveUid Archive whose chunks are loaded.
     * @param int    $limit      Maximum number of chunks to return.
     *
     * @return array<int, array<string, mixed>> One document per chunk, see documentFromRow().
     */
    public function findQueuedDocuments(string $archiveUid, int $limit): array
    {
        $rows = Db::table('chunks')
            ->join('archives', 'chunks.archive_uid', '=', 'archives.archive_uid')
            ->where('chunks.archive_uid', $archiveUid)
            ->where('chunks.embedding_status', EmbeddingStatus::EMBEDDED)
            ->whereIn('chunks.search_index_status', [SearchIndexStatus::QUEUED, SearchIndexStatus::INDEXING])
            ->orderBy('chunks.chunk_index')
            ->limit($limit)
            ->get([
                'chunks.chunk_uid',
                'chunks.archive_uid',
                'chunks.chunk_index',
                'chunks.page_start',
                'chunks.page_end',
                'chunks.text',
                'chunks.embedding_ref',
                'chunks.embedding_model',
                'chunks.created_time',
                'chunks.updated_time',
                'archives.title',
                'archives.summary',
                'archives.source',
                'archives.author',
                'archives.year',
                'archives.series',
                'archives.tags',
            ])
            ->all();

        return array_map(fn (object $row): array => $this->documentFromRow($row), $rows);
    }

    /**
     * Marks the given chunks as INDEXING and clears any stored error.
     *
     * @param array $chunkUids Chunk uids to update; an empty array is a no-op.
     */
    public function markIndexing(array $chunkUids): void
    {
        $this->updateStatus($chunkUids, SearchIndexStatus::INDEXING, null);
    }

    /**
     * Marks the given chunks as INDEXED and clears any stored error.
     *
     * @param array $chunkUids Chunk uids to update; an empty array is a no-op.
     */
    public function markIndexed(array $chunkUids): void
    {
        $this->updateStatus($chunkUids, SearchIndexStatus::INDEXED, null);
    }

    /**
     * Marks the given chunks as failed and stores the error message.
     *
     * @param array  $chunkUids Chunk uids to update; an empty array is a no-op.
     * @param string $error     Error message; stored truncated to MAX_ERROR_LENGTH characters.
     * @param bool   $retryable True selects FAILED_RETRYABLE, false FAILED_TERMINAL.
     */
    public function markFailed(array $chunkUids, string $error, bool $retryable): void
    {
        $this->updateStatus(
            $chunkUids,
            $retryable ? SearchIndexStatus::FAILED_RETRYABLE : SearchIndexStatus::FAILED_TERMINAL,
            $error
        );
    }

    /**
     * Converts one joined chunks/archives row into a search document array.
     *
     * The embedding vector is read from the `embedding` key of the JSON stored in
     * `embedding_ref`; when it is missing or not an array the document gets an
     * empty vector and zero dimensions.
     *
     * @param object $row Row object as returned by findQueuedDocuments()' query.
     *
     * @return array<string, mixed>
     */
    private function documentFromRow(object $row): array
    {
        $embeddingRef = $this->decodeJson($row->embedding_ref ?? null, []);
        $embedding = is_array($embeddingRef['embedding'] ?? null) ? $embeddingRef['embedding'] : [];

        return [
            'chunk_uid' => (string) $row->chunk_uid,
            'archive_uid' => (string) $row->archive_uid,
            'chunk_index' => (int) $row->chunk_index,
            'page_start' => $row->page_start === null ? null : (int) $row->page_start,
            'page_end' => $row->page_end === null ? null : (int) $row->page_end,
            'title' => $row->title,
            'summary' => $row->summary,
            'source' => $row->source,
            'author' => $row->author,
            'year' => $row->year === null ? null : (int) $row->year,
            'series' => $row->series,
            'tags' => $this->decodeJson($row->tags ?? null, []),
            'text' => (string) $row->text,
            'embedding' => array_map('floatval', $embedding),
            'embedding_model' => (string) $row->embedding_model,
            'embedding_dimensions' => count($embedding),
            'created_time' => $this->dateString($row->created_time ?? null),
            'updated_time' => $this->dateString($row->updated_time ?? null),
        ];
    }

    /**
     * Writes a search-index status (and optional error) to the given chunks and
     * stamps `search_index_updated_at` with the database's CURRENT_TIMESTAMP.
     *
     * @param array       $chunkUids Chunk uids to update; an empty array is a no-op.
     * @param int         $status    One of the SearchIndexStatus values.
     * @param string|null $error     Error text (truncated to MAX_ERROR_LENGTH characters)
     *                               or null to clear the column.
     */
    private function updateStatus(array $chunkUids, int $status, ?string $error): void
    {
        if ($chunkUids === []) {
            return;
        }

        Db::table('chunks')->whereIn('chunk_uid', $chunkUids)->update([
            'search_index_status' => $status,
            'search_index_error' => $error === null ? null : mb_substr($error, 0, self::MAX_ERROR_LENGTH),
            'search_index_updated_at' => Db::raw('CURRENT_TIMESTAMP'),
        ]);
    }

    /**
     * Best-effort JSON decoding that always yields an array.
     *
     * @param mixed $value    An already-decoded array (returned as is), a JSON string, or anything else.
     * @param array $fallback Returned when $value is not a string, is blank, is invalid JSON,
     *                        or decodes to a non-array value.
     *
     * @return array
     */
    private function decodeJson(mixed $value, array $fallback): array
    {
        if (is_array($value)) {
            return $value;
        }

        if (!is_string($value) || trim($value) === '') {
            return $fallback;
        }

        $decoded = json_decode($value, true);

        return is_array($decoded) ? $decoded : $fallback;
    }

    /**
     * Formats a stored date value as an ATOM (ISO-8601 style) string using date().
     *
     * @param mixed $value Date value from the row; it is cast to string and parsed with strtotime().
     *
     * @return string|null Null when $value is null. When the value cannot be parsed, the
     *                     current time is used instead (existing best-effort behaviour).
     */
    private function dateString(mixed $value): ?string
    {
        if ($value === null) {
            return null;
        }

        $timestamp = strtotime((string) $value);

        // Compare strictly against false: a valid timestamp of 0 (the Unix epoch) is
        // falsy and must not be mistaken for a parse failure.
        if ($timestamp === false) {
            $timestamp = time();
        }

        return date(DATE_ATOM, $timestamp);
    }
}
|