where('embedding_status', EmbeddingStatus::EMBEDDED)
            ->whereIn('search_index_status', $statuses)
            ->select('archive_uid')
            ->groupBy('archive_uid')
            ->orderByRaw('MIN(id)')
            ->limit($limit)
            ->pluck('archive_uid')
            ->all();

        // Normalise the ids to strings; array_filter() then drops falsy entries
        // (e.g. '') and array_values() re-indexes the result as a list.
        $archiveUids = array_values(array_filter(array_map('strval', $archiveUids)));

        // Move every matching chunk of each selected archive to QUEUED and
        // clear any error recorded by a previous attempt.
        foreach ($archiveUids as $archiveUid) {
            Db::table('chunks')
                ->where('archive_uid', $archiveUid)
                ->where('embedding_status', EmbeddingStatus::EMBEDDED)
                ->whereIn('search_index_status', $statuses)
                ->update([
                    'search_index_status' => SearchIndexStatus::QUEUED,
                    'search_index_error' => null,
                    'search_index_updated_at' => Db::raw('CURRENT_TIMESTAMP'),
                ]);
        }

        return $archiveUids;
    }

    /**
     * Load the chunks of one archive that are waiting to be (or are being) indexed.
     *
     * Only chunks with embedding status EMBEDDED and a search-index status of
     * QUEUED or INDEXING are returned, ordered by chunk_index and joined with
     * the metadata columns of their archive row.
     *
     * @param string $archiveUid Archive whose chunks are loaded.
     * @param int    $limit      Maximum number of chunks to return.
     *
     * @return array<int, array<string, mixed>> One document array per chunk, as built by documentFromRow().
     */
    public function findQueuedDocuments(string $archiveUid, int $limit): array
    {
        $rows = Db::table('chunks')
            ->join('archives', 'chunks.archive_uid', '=', 'archives.archive_uid')
            ->where('chunks.archive_uid', $archiveUid)
            ->where('chunks.embedding_status', EmbeddingStatus::EMBEDDED)
            ->whereIn('chunks.search_index_status', [SearchIndexStatus::QUEUED, SearchIndexStatus::INDEXING])
            ->orderBy('chunks.chunk_index')
            ->limit($limit)
            ->get([
                'chunks.chunk_uid',
                'chunks.archive_uid',
                'chunks.chunk_index',
                'chunks.page_start',
                'chunks.page_end',
                'chunks.text',
                'chunks.embedding_ref',
                'chunks.embedding_model',
                'chunks.created_time',
                'chunks.updated_time',
                'archives.title',
                'archives.source',
                'archives.author',
                'archives.year',
                'archives.series',
                'archives.tags',
            ])
            ->all();

        return array_map(fn (object $row): array => $this->documentFromRow($row), $rows);
    }

    /**
     * Mark the given chunks as INDEXING and clear their stored error.
     *
     * @param array $chunkUids Chunk uids to update; an empty array is a no-op.
     */
    public function markIndexing(array $chunkUids): void
    {
        $this->updateStatus($chunkUids, SearchIndexStatus::INDEXING, null);
    }

    /**
     * Mark the given chunks as INDEXED and clear their stored error.
     *
     * @param array $chunkUids Chunk uids to update; an empty array is a no-op.
     */
    public function markIndexed(array $chunkUids): void
    {
        $this->updateStatus($chunkUids, SearchIndexStatus::INDEXED, null);
    }

    /**
     * Record an indexing failure for the given chunks.
     *
     * @param array  $chunkUids Chunk uids to update; an empty array is a no-op.
     * @param string $error     Error message to store (truncated by updateStatus()).
     * @param bool   $retryable True selects FAILED_RETRYABLE, false selects FAILED_TERMINAL.
     */
    public function markFailed(array $chunkUids, string $error, bool $retryable): void
    {
        $this->updateStatus(
            $chunkUids,
            $retryable ? SearchIndexStatus::FAILED_RETRYABLE : SearchIndexStatus::FAILED_TERMINAL,
            $error
        );
    }

    /**
     * Convert a joined chunk/archive row into a plain document array.
     *
     * The embedding vector is read from the "embedding" key of the JSON stored
     * in embedding_ref; when that key is missing or not an array the document
     * gets an empty vector and embedding_dimensions of 0.
     *
     * @param object $row Row object exposing the columns selected in findQueuedDocuments().
     *
     * @return array<string, mixed>
     */
    private function documentFromRow(object $row): array
    {
        $embeddingRef = $this->decodeJson($row->embedding_ref ?? null, []);
        $embedding = is_array($embeddingRef['embedding'] ?? null) ? $embeddingRef['embedding'] : [];

        return [
            'chunk_uid' => (string) $row->chunk_uid,
            'archive_uid' => (string) $row->archive_uid,
            'chunk_index' => (int) $row->chunk_index,
            // Nullable numeric columns stay null instead of being cast to 0.
            'page_start' => $row->page_start === null ? null : (int) $row->page_start,
            'page_end' => $row->page_end === null ? null : (int) $row->page_end,
            'title' => $row->title,
            'source' => $row->source,
            'author' => $row->author,
            'year' => $row->year === null ? null : (int) $row->year,
            'series' => $row->series,
            'tags' => $this->decodeJson($row->tags ?? null, []),
            'text' => (string) $row->text,
            'embedding' => array_map('floatval', $embedding),
            'embedding_model' => (string) $row->embedding_model,
            'embedding_dimensions' => count($embedding),
            'created_time' => $this->dateString($row->created_time ?? null),
            'updated_time' => $this->dateString($row->updated_time ?? null),
        ];
    }

    /**
     * Set the search-index status (and error text) of a set of chunks.
     *
     * Also stamps search_index_updated_at with the database's CURRENT_TIMESTAMP.
     *
     * @param array       $chunkUids Chunk uids to update; nothing is executed when empty.
     * @param int         $status    Value written to search_index_status.
     * @param string|null $error     Error text, or null to clear the column.
     */
    private function updateStatus(array $chunkUids, int $status, ?string $error): void
    {
        if ($chunkUids === []) {
            return;
        }

        Db::table('chunks')->whereIn('chunk_uid', $chunkUids)->update([
            'search_index_status' => $status,
            // Keep at most the first 4000 characters of the message.
            'search_index_error' => $error === null ? null : mb_substr($error, 0, 4000),
            'search_index_updated_at' => Db::raw('CURRENT_TIMESTAMP'),
        ]);
    }

    /**
     * Decode a JSON column value into an array, tolerating bad input.
     *
     * @param mixed $value    An array (returned as-is), a JSON string, or anything else.
     * @param array $fallback Returned when $value is not a string, is blank,
     *                        or does not decode to an array.
     *
     * @return array
     */
    private function decodeJson(mixed $value, array $fallback): array
    {
        if (is_array($value)) {
            return $value;
        }

        if (!is_string($value) || trim($value) === '') {
            return $fallback;
        }

        $decoded = json_decode($value, true);

        return is_array($decoded) ? $decoded : $fallback;
    }

    /**
     * Format a date-like value as an ATOM (RFC 3339 style) string.
     *
     * @param mixed $value Value that is cast to string and parsed with strtotime().
     *
     * @return string|null Null when $value is null; otherwise the formatted date.
     *                     Input that strtotime() cannot parse falls back to the
     *                     current time.
     */
    private function dateString(mixed $value): ?string
    {
        if ($value === null) {
            return null;
        }

        $timestamp = strtotime((string) $value);

        // strtotime() signals failure with false. Compare strictly: 0 is a valid
        // timestamp (the Unix epoch) and must not be replaced by the current time.
        return date(DATE_ATOM, $timestamp === false ? time() : $timestamp);
    }
}