proofdb/app/service/ArchiveRepository.php
2026-05-08 00:05:51 +08:00

300 lines
11 KiB
PHP

<?php
namespace app\service;
use support\Db;
class ArchiveRepository
{
/**
 * Stores an imported archive and replaces all of its chunks inside one transaction.
 *
 * The archive row is upserted by `archive_uid`. Chunk rows already stored for that archive
 * are deleted and the supplied chunks are inserted with their embedding and search-index
 * columns reset (status 0, every other tracking column null).
 *
 * @param array $import Reads `$import['archive']` (needs `archive_uid`; the remaining archive
 *                      fields are optional) and `$import['chunks']`, a list of rows carrying
 *                      `chunk_uid`, `chunk_index`, `page_start`, `page_end`, `text`, `length`.
 *
 * @throws \JsonException When tags, metadata or the chunk UID list cannot be JSON-encoded.
 */
public function saveImport(array $import): void
{
    Db::transaction(function () use ($import): void {
        $archive = $import['archive'];
        $chunks = $import['chunks'];
        $archiveUid = $archive['archive_uid'];

        // Without JSON_THROW_ON_ERROR json_encode() returns false on failure (e.g. malformed
        // UTF-8) and that false would be written to the column. The encoding happens while
        // the arguments of the first query are built, so a failure aborts before any write.
        $jsonFlags = JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_THROW_ON_ERROR;

        Db::table('archives')->updateOrInsert(
            ['archive_uid' => $archiveUid],
            [
                'title' => $archive['title'] ?? null,
                'summary' => $archive['summary'] ?? null,
                'year' => $archive['year'] ?? null,
                'author' => $archive['author'] ?? null,
                'source' => $archive['source'] ?? null,
                'series' => $archive['series'] ?? null,
                'tags' => json_encode($archive['tags'] ?? [], $jsonFlags),
                'metadata' => json_encode($archive['metadata'] ?? [], $jsonFlags),
                'content' => $archive['content'] ?? null,
                'raw' => $archive['raw'] ?? null,
                // The archive row keeps a denormalised list of its chunk UIDs.
                'chunks' => json_encode(array_column($chunks, 'chunk_uid'), $jsonFlags),
            ]
        );

        // Chunks are replaced wholesale: drop the previous set, then insert the new one.
        Db::table('chunks')->where('archive_uid', $archiveUid)->delete();

        $rows = array_map(
            static fn ($chunk): array => [
                'chunk_uid' => $chunk['chunk_uid'],
                'archive_uid' => $archiveUid,
                'chunk_index' => $chunk['chunk_index'],
                'page_start' => $chunk['page_start'],
                'page_end' => $chunk['page_end'],
                'text' => $chunk['text'],
                'length' => $chunk['length'],
                'embedding_status' => 0,
                'embedding_ref' => null,
                'embedding_model' => null,
                'embedding_error' => null,
                'embedding_updated_at' => null,
                'search_index_status' => 0,
                'search_index_error' => null,
                'search_index_updated_at' => null,
            ],
            $chunks
        );

        // Multi-row inserts instead of one statement per chunk. Batches of 50 rows
        // (15 columns each) keep the number of bound parameters per statement small.
        foreach (array_chunk($rows, 50) as $batch) {
            Db::table('chunks')->insert($batch);
        }
    });
}
/**
 * Looks up a single archive by its UID.
 *
 * @param string $archiveUid Value matched against the `archive_uid` column.
 *
 * @return array|null The row converted by archiveToArray(), or null when nothing matches.
 */
public function findArchive(string $archiveUid): ?array
{
    $record = Db::table('archives')
        ->where('archive_uid', $archiveUid)
        ->first();

    return $record ? $this->archiveToArray($record) : null;
}
/**
 * Concatenates the text of an archive's first chunks, ordered by `chunk_index`.
 *
 * @param string $archiveUid Archive whose chunks are read.
 * @param int    $limit      Maximum number of chunks to include.
 *
 * @return string Chunk texts separated by a blank line; an empty string when no chunk matches.
 */
public function findChunksText(string $archiveUid, int $limit = 20): string
{
    $query = Db::table('chunks')
        ->where('archive_uid', $archiveUid)
        ->orderBy('chunk_index')
        ->limit($limit);

    $texts = [];
    foreach ($query->get(['text'])->all() as $record) {
        $texts[] = (string) $record->text;
    }

    return implode("\n\n", $texts);
}
/**
 * Loads one chunk together with the descriptive fields of its parent archive.
 *
 * The row is converted by chunkRowToArray(), the same helper findArchiveChunks() uses, so
 * single-chunk and per-archive lookups always return the same shape.
 *
 * @param string $chunkUid Value matched against `chunks.chunk_uid`.
 *
 * @return array|null The chunk (with a nested `archive` entry), or null when no chunk matches.
 *                    Because of the inner join, a chunk whose archive row is missing is also
 *                    reported as null.
 */
public function findChunk(string $chunkUid): ?array
{
    $row = Db::table('chunks')
        ->join('archives', 'chunks.archive_uid', '=', 'archives.archive_uid')
        ->where('chunks.chunk_uid', $chunkUid)
        ->first([
            'chunks.chunk_uid',
            'chunks.archive_uid',
            'chunks.chunk_index',
            'chunks.page_start',
            'chunks.page_end',
            'chunks.text',
            'chunks.length',
            'chunks.embedding_status',
            'chunks.embedding_ref',
            'chunks.embedding_model',
            'chunks.embedding_error',
            'chunks.search_index_status',
            'chunks.search_index_error',
            'archives.title',
            'archives.summary',
            'archives.year',
            'archives.author',
            'archives.source',
            'archives.series',
            'archives.tags',
            'archives.metadata',
        ]);

    if (!$row) {
        return null;
    }

    return $this->chunkRowToArray($row);
}
/**
 * Lists every chunk of an archive, ordered by `chunk_index`, each with its archive fields.
 *
 * @param string $archiveUid Value matched against `chunks.archive_uid`.
 *
 * @return array One chunkRowToArray() result per chunk; empty when nothing matches.
 */
public function findArchiveChunks(string $archiveUid): array
{
    $columns = [
        'chunks.chunk_uid',
        'chunks.archive_uid',
        'chunks.chunk_index',
        'chunks.page_start',
        'chunks.page_end',
        'chunks.text',
        'chunks.length',
        'chunks.embedding_status',
        'chunks.embedding_ref',
        'chunks.embedding_model',
        'chunks.embedding_error',
        'chunks.search_index_status',
        'chunks.search_index_error',
        'archives.title',
        'archives.summary',
        'archives.year',
        'archives.author',
        'archives.source',
        'archives.series',
        'archives.tags',
        'archives.metadata',
    ];

    $records = Db::table('chunks')
        ->join('archives', 'chunks.archive_uid', '=', 'archives.archive_uid')
        ->where('chunks.archive_uid', $archiveUid)
        ->orderBy('chunks.chunk_index')
        ->get($columns)
        ->all();

    $result = [];
    foreach ($records as $position => $record) {
        $result[$position] = $this->chunkRowToArray($record);
    }

    return $result;
}
/**
 * Records AI enrichment details in an archive's metadata and updates selected columns.
 *
 * The existing metadata is loaded, `$aiMeta` is stored under its `ai_enrichment` key, and
 * the result is written back together with any of `title`, `summary`, `year`, `author`,
 * `series` and `tags` that are present in `$fields`. Other keys of `$fields` are ignored.
 *
 * @param string $archiveUid Archive to update; nothing happens when it does not exist.
 * @param array  $fields     New column values keyed by column name. `tags` is JSON-encoded,
 *                           and a null `tags` value is stored as an empty list.
 * @param array  $aiMeta     Data stored under `metadata.ai_enrichment`.
 *
 * @throws \JsonException When the metadata or tags cannot be JSON-encoded.
 */
public function updateMetadata(string $archiveUid, array $fields, array $aiMeta): void
{
    $archive = $this->findArchive($archiveUid);
    if ($archive === null) {
        // No such archive: the UPDATE below could not match a row anyway.
        return;
    }

    // Stored metadata that decoded to a scalar cannot take an offset assignment;
    // start from an empty map in that case instead of failing.
    $metadata = is_array($archive['metadata'] ?? null) ? $archive['metadata'] : [];
    $metadata['ai_enrichment'] = $aiMeta;

    // Throwing on failure keeps json_encode()'s `false` from overwriting the stored metadata.
    $jsonFlags = JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_THROW_ON_ERROR;

    $updates = [
        'metadata' => json_encode($metadata, $jsonFlags),
    ];

    foreach (['title', 'summary', 'year', 'author', 'series', 'tags'] as $field) {
        // array_key_exists() rather than isset(): an explicit null is a requested update.
        if (!array_key_exists($field, $fields)) {
            continue;
        }
        $updates[$field] = $field === 'tags'
            ? json_encode($fields[$field] ?? [], $jsonFlags)
            : $fields[$field];
    }

    Db::table('archives')->where('archive_uid', $archiveUid)->update($updates);
}
/**
 * Tells whether an archive still lacks descriptive metadata.
 *
 * An archive needs metadata when its title is marked as a fallback
 * (`metadata.title_source === 'fallback'`) or when any of `title`, `year`, `author`, `tags`,
 * `summary` is missing, null, an empty array or a blank string. A `year` that is falsy or
 * not a positive integer also counts as missing.
 *
 * @param array $archive Archive data keyed by field name.
 *
 * @return bool True when at least one of the checked fields is unusable.
 */
public function archiveNeedsMetadata(array $archive): bool
{
    $titleIsFallback = ($archive['metadata']['title_source'] ?? null) === 'fallback';
    if ($titleIsFallback) {
        return true;
    }

    foreach (['title', 'year', 'author', 'tags', 'summary'] as $name) {
        $value = $archive[$name] ?? null;

        if (is_array($value)) {
            // Arrays only count as missing when they are empty.
            if ($value === []) {
                return true;
            }
            continue;
        }

        $unusableYear = $name === 'year' && (!$value || (int) $value <= 0);
        if ($unusableYear || $value === null || trim((string) $value) === '') {
            return true;
        }
    }

    return false;
}
/**
 * Converts an `archives` row object into an associative array.
 *
 * The `tags`, `metadata` and `chunks` columns are JSON-decoded; a null column, undecodable
 * JSON or any falsy decoded value becomes an empty array. All other columns are copied
 * unchanged.
 *
 * @param object $archive Row exposing the archive columns as properties.
 *
 * @return array The archive keyed by column name.
 */
private function archiveToArray(object $archive): array
{
    // Decodes one JSON column, using $emptyJson when the column is null.
    $decode = static fn ($json, string $emptyJson) => json_decode($json ?? $emptyJson, true) ?: [];

    return [
        'archive_uid' => $archive->archive_uid,
        'title' => $archive->title,
        'summary' => $archive->summary,
        'year' => $archive->year,
        'author' => $archive->author,
        'source' => $archive->source,
        'series' => $archive->series,
        'tags' => $decode($archive->tags, '[]'),
        'metadata' => $decode($archive->metadata, '{}'),
        'content' => $archive->content,
        'raw' => $archive->raw,
        'chunks' => $decode($archive->chunks, '[]'),
    ];
}
/**
 * Converts a joined chunk/archive row into the array shape returned by the chunk finders.
 *
 * Identifiers and text are cast to string, counters and statuses to int (nullable columns
 * stay null), `pages` is derived from the page bounds, and the JSON columns are decoded with
 * decodeJson(). The archive columns are grouped under the `archive` key.
 *
 * @param object $row Row exposing the selected chunk and archive columns as properties.
 *
 * @return array The chunk with a nested `archive` entry.
 */
private function chunkRowToArray(object $row): array
{
    $intOrNull = static fn (mixed $value): ?int => $value === null ? null : (int) $value;

    $archive = [
        'archive_uid' => (string) $row->archive_uid,
        'title' => $row->title,
        'summary' => $row->summary,
        'year' => $intOrNull($row->year),
        'author' => $row->author,
        'source' => $row->source,
        'series' => $row->series,
        'tags' => $this->decodeJson($row->tags ?? null, []),
        'metadata' => $this->decodeJson($row->metadata ?? null, []),
    ];

    return [
        'chunk_uid' => (string) $row->chunk_uid,
        'archive_uid' => (string) $row->archive_uid,
        'chunk_index' => (int) $row->chunk_index,
        'page_start' => $intOrNull($row->page_start),
        'page_end' => $intOrNull($row->page_end),
        'pages' => $this->pages($row->page_start, $row->page_end),
        'text' => (string) $row->text,
        'length' => $intOrNull($row->length),
        'embedding_status' => (int) $row->embedding_status,
        'embedding_ref' => $this->decodeJson($row->embedding_ref ?? null, null),
        'embedding_model' => $row->embedding_model,
        'embedding_error' => $row->embedding_error,
        'search_index_status' => (int) $row->search_index_status,
        'search_index_error' => $row->search_index_error,
        'archive' => $archive,
    ];
}
/**
 * Decodes a JSON column value, substituting a fallback when nothing usable is stored.
 *
 * @param mixed $value    Raw column value: null, an already-decoded array, or a JSON string.
 * @param mixed $fallback Returned for null, blank or non-string input, for undecodable JSON
 *                        and for the JSON literal `null`.
 *
 * @return mixed The array that was passed in, the decoded value (JSON objects become
 *               associative arrays), or $fallback.
 */
private function decodeJson(mixed $value, mixed $fallback): mixed
{
    if (is_array($value)) {
        return $value;
    }

    // Covers null as well as any other non-string value.
    if (!is_string($value) || trim($value) === '') {
        return $fallback;
    }

    // json_decode() yields null both for malformed input and for the literal `null`.
    // Either way there is no usable value, so a stored JSON `null` is treated exactly
    // like an SQL NULL instead of leaking null past a non-null fallback.
    return json_decode($value, true) ?? $fallback;
}
/**
 * Builds the list of pages covered by a chunk from its page bounds.
 *
 * When both bounds are numeric the result is every page from start to end as integers; an
 * end below the start is treated as equal to the start. Otherwise the result holds
 * whichever bounds are neither null nor an empty string, unchanged and in start/end order.
 *
 * @param mixed $pageStart First page of the chunk, if known.
 * @param mixed $pageEnd   Last page of the chunk, if known.
 *
 * @return array List of pages; empty when neither bound is set.
 */
private function pages(mixed $pageStart, mixed $pageEnd): array
{
    if (is_numeric($pageStart) && is_numeric($pageEnd)) {
        $first = (int) $pageStart;
        $last = max($first, (int) $pageEnd);

        return range($first, $last);
    }

    $known = [];
    foreach ([$pageStart, $pageEnd] as $bound) {
        if ($bound === null || $bound === '') {
            continue;
        }
        $known[] = $bound;
    }

    return $known;
}
}