proofdb/app/service/Search/ChunkSearchIndexRepository.php
2026-05-07 01:40:58 +08:00

158 lines
5.4 KiB
PHP

<?php
namespace app\service\Search;
use app\service\Embedding\EmbeddingStatus;
use support\Db;
/**
 * Database access for the search-index lifecycle of rows in the `chunks` table.
 *
 * All queries go through the `support\Db` query builder and read or write the
 * `search_index_status`, `search_index_error` and `search_index_updated_at`
 * columns of `chunks`.
 */
class ChunkSearchIndexRepository
{
    /**
     * Select up to $limit archives that have embedded chunks awaiting indexing
     * and flag those chunks as QUEUED.
     *
     * Archives are ordered by the smallest `id` among their matching chunks.
     *
     * NOTE(review): QUEUED and INDEXING are part of the selection, so chunks
     * already in flight are reset to QUEUED as well — presumably to recover
     * stalled work; confirm against the worker.
     *
     * @param int $limit Maximum number of archives to pick up.
     *
     * @return array List of archive UIDs (strings) whose chunks were queued.
     */
    public function queuePendingArchiveTasks(int $limit): array
    {
        $statuses = [
            SearchIndexStatus::PENDING,
            SearchIndexStatus::QUEUED,
            SearchIndexStatus::INDEXING,
            SearchIndexStatus::FAILED_RETRYABLE,
        ];

        $archiveUids = Db::table('chunks')
            ->where('embedding_status', EmbeddingStatus::EMBEDDED)
            ->whereIn('search_index_status', $statuses)
            ->select('archive_uid')
            ->groupBy('archive_uid')
            ->orderByRaw('MIN(id)')
            ->limit($limit)
            ->pluck('archive_uid')
            ->all();

        // Normalise to strings; array_filter drops empty (falsy) UIDs and
        // array_values restores a gap-free list.
        $archiveUids = array_values(array_filter(array_map('strval', $archiveUids)));

        if ($archiveUids === []) {
            return [];
        }

        // One UPDATE for every selected archive instead of one query per archive.
        Db::table('chunks')
            ->whereIn('archive_uid', $archiveUids)
            ->where('embedding_status', EmbeddingStatus::EMBEDDED)
            ->whereIn('search_index_status', $statuses)
            ->update([
                'search_index_status' => SearchIndexStatus::QUEUED,
                'search_index_error' => null,
                'search_index_updated_at' => Db::raw('CURRENT_TIMESTAMP'),
            ]);

        return $archiveUids;
    }

    /**
     * Load the embedded chunks of one archive that are QUEUED or INDEXING,
     * joined with their archive metadata and ordered by `chunk_index`.
     *
     * @param string $archiveUid Archive whose chunks are fetched.
     * @param int    $limit      Maximum number of chunks to return.
     *
     * @return array List of document arrays as built by documentFromRow().
     */
    public function findQueuedDocuments(string $archiveUid, int $limit): array
    {
        $rows = Db::table('chunks')
            ->join('archives', 'chunks.archive_uid', '=', 'archives.archive_uid')
            ->where('chunks.archive_uid', $archiveUid)
            ->where('chunks.embedding_status', EmbeddingStatus::EMBEDDED)
            ->whereIn('chunks.search_index_status', [SearchIndexStatus::QUEUED, SearchIndexStatus::INDEXING])
            ->orderBy('chunks.chunk_index')
            ->limit($limit)
            ->get([
                'chunks.chunk_uid',
                'chunks.archive_uid',
                'chunks.chunk_index',
                'chunks.page_start',
                'chunks.page_end',
                'chunks.text',
                'chunks.embedding_ref',
                'chunks.embedding_model',
                'chunks.created_time',
                'chunks.updated_time',
                'archives.title',
                'archives.source',
                'archives.author',
                'archives.year',
                'archives.series',
                'archives.tags',
            ])
            ->all();

        return array_map(fn (object $row): array => $this->documentFromRow($row), $rows);
    }

    /**
     * Mark the given chunks as INDEXING and clear any stored error.
     *
     * @param array $chunkUids Chunk UIDs to update; an empty list is a no-op.
     */
    public function markIndexing(array $chunkUids): void
    {
        $this->updateStatus($chunkUids, SearchIndexStatus::INDEXING, null);
    }

    /**
     * Mark the given chunks as INDEXED and clear any stored error.
     *
     * @param array $chunkUids Chunk UIDs to update; an empty list is a no-op.
     */
    public function markIndexed(array $chunkUids): void
    {
        $this->updateStatus($chunkUids, SearchIndexStatus::INDEXED, null);
    }

    /**
     * Record an indexing failure for the given chunks.
     *
     * @param array  $chunkUids Chunk UIDs to update; an empty list is a no-op.
     * @param string $error     Error message; stored truncated to 4000 characters.
     * @param bool   $retryable true stores FAILED_RETRYABLE, false stores FAILED_TERMINAL.
     */
    public function markFailed(array $chunkUids, string $error, bool $retryable): void
    {
        $this->updateStatus(
            $chunkUids,
            $retryable ? SearchIndexStatus::FAILED_RETRYABLE : SearchIndexStatus::FAILED_TERMINAL,
            $error
        );
    }

    /**
     * Convert one joined chunks/archives row into a search document array.
     *
     * The vector is read from the `embedding` key of the JSON held in
     * `embedding_ref`; when that key is missing or not an array the document
     * gets an empty embedding and `embedding_dimensions` of 0.
     *
     * @param object $row Row object exposing the columns selected by findQueuedDocuments().
     *
     * @return array Document with normalised scalar types and ISO-8601 timestamps.
     */
    private function documentFromRow(object $row): array
    {
        $embeddingRef = $this->decodeJson($row->embedding_ref ?? null, []);
        $embedding = is_array($embeddingRef['embedding'] ?? null) ? $embeddingRef['embedding'] : [];

        return [
            'chunk_uid' => (string) $row->chunk_uid,
            'archive_uid' => (string) $row->archive_uid,
            'chunk_index' => (int) $row->chunk_index,
            'page_start' => $row->page_start === null ? null : (int) $row->page_start,
            'page_end' => $row->page_end === null ? null : (int) $row->page_end,
            'title' => $row->title,
            'source' => $row->source,
            'author' => $row->author,
            'year' => $row->year === null ? null : (int) $row->year,
            'series' => $row->series,
            'tags' => $this->decodeJson($row->tags ?? null, []),
            'text' => (string) $row->text,
            'embedding' => array_map('floatval', $embedding),
            'embedding_model' => (string) $row->embedding_model,
            'embedding_dimensions' => count($embedding),
            'created_time' => $this->dateString($row->created_time ?? null),
            'updated_time' => $this->dateString($row->updated_time ?? null),
        ];
    }

    /**
     * Write a search-index status (and optional error) to a set of chunks.
     *
     * @param array       $chunkUids Chunk UIDs to update; an empty list issues no query.
     * @param int         $status    SearchIndexStatus value to store.
     * @param string|null $error     Error text, or null to clear the column.
     */
    private function updateStatus(array $chunkUids, int $status, ?string $error): void
    {
        if ($chunkUids === []) {
            return;
        }

        Db::table('chunks')->whereIn('chunk_uid', $chunkUids)->update([
            'search_index_status' => $status,
            // Cap the stored message at 4000 characters (multibyte-aware).
            'search_index_error' => $error === null ? null : mb_substr($error, 0, 4000),
            'search_index_updated_at' => Db::raw('CURRENT_TIMESTAMP'),
        ]);
    }

    /**
     * Decode a JSON column value into an array.
     *
     * @param mixed $value    Already-decoded array, JSON string, or anything else.
     * @param array $fallback Returned for non-strings, blank strings, invalid JSON
     *                        and JSON that does not decode to an array.
     *
     * @return array
     */
    private function decodeJson(mixed $value, array $fallback): array
    {
        if (is_array($value)) {
            return $value;
        }

        if (!is_string($value) || trim($value) === '') {
            return $fallback;
        }

        $decoded = json_decode($value, true);

        return is_array($decoded) ? $decoded : $fallback;
    }

    /**
     * Format a date value as an ISO-8601 (DATE_ATOM) string.
     *
     * @param mixed $value null, a DateTimeInterface, or anything castable to a
     *                     string that strtotime() understands.
     *
     * @return string|null null for null input; otherwise the formatted date.
     *                     Unparseable input falls back to the current time.
     */
    private function dateString(mixed $value): ?string
    {
        if ($value === null) {
            return null;
        }

        // Date objects cannot be cast to string; format them directly.
        if ($value instanceof \DateTimeInterface) {
            return $value->format(DATE_ATOM);
        }

        $timestamp = strtotime((string) $value);

        // Compare strictly against false: timestamp 0 (the Unix epoch) is a
        // valid result and must not be replaced by the current time.
        return date(DATE_ATOM, $timestamp === false ? time() : $timestamp);
    }
}