300 lines
11 KiB
PHP
300 lines
11 KiB
PHP
<?php
|
|
|
|
namespace app\service;
|
|
|
|
use support\Db;
|
|
|
|
/**
 * Database-backed repository for archives and the text chunks cut from them.
 *
 * All access goes through the `Db` facade imported by this file
 * (NOTE(review): presumably webman's wrapper around the illuminate/database
 * query builder - confirm). Two tables are touched: `archives` and `chunks`,
 * linked by `archive_uid`. The JSON-valued columns (`tags`, `metadata`,
 * `chunks`, `embedding_ref`) are encoded on write and decoded on read.
 */
class ArchiveRepository
{
    /**
     * Flags applied to every JSON column written by this class.
     *
     * Invalid UTF-8 is replaced with U+FFFD instead of making the encode
     * fail, and any remaining failure throws rather than returning `false`,
     * which would otherwise be persisted as the column value.
     */
    private const JSON_ENCODE_FLAGS = JSON_UNESCAPED_UNICODE
        | JSON_UNESCAPED_SLASHES
        | JSON_INVALID_UTF8_SUBSTITUTE
        | JSON_THROW_ON_ERROR;

    /**
     * Columns selected whenever a chunk is read together with its archive.
     * Every property consumed by chunkRowToArray() must be listed here.
     */
    private const CHUNK_WITH_ARCHIVE_COLUMNS = [
        'chunks.chunk_uid',
        'chunks.archive_uid',
        'chunks.chunk_index',
        'chunks.page_start',
        'chunks.page_end',
        'chunks.text',
        'chunks.length',
        'chunks.embedding_status',
        'chunks.embedding_ref',
        'chunks.embedding_model',
        'chunks.embedding_error',
        'chunks.search_index_status',
        'chunks.search_index_error',
        'archives.title',
        'archives.summary',
        'archives.year',
        'archives.author',
        'archives.source',
        'archives.series',
        'archives.tags',
        'archives.metadata',
    ];

    /**
     * Embedding / search-index bookkeeping written for every freshly
     * imported chunk: both statuses start at 0 and all other fields are null.
     */
    private const FRESH_CHUNK_STATE = [
        'embedding_status' => 0,
        'embedding_ref' => null,
        'embedding_model' => null,
        'embedding_error' => null,
        'embedding_updated_at' => null,
        'search_index_status' => 0,
        'search_index_error' => null,
        'search_index_updated_at' => null,
    ];

    /**
     * Stores an import: upserts the archive row and replaces all its chunks.
     *
     * Runs inside a single `Db::transaction()`. Existing chunks of the
     * archive are deleted and the supplied ones inserted with reset
     * embedding / search-index state.
     *
     * @param array $import Must contain `archive` (with at least `archive_uid`)
     *                      and `chunks` (a list of rows with `chunk_uid`,
     *                      `chunk_index`, `page_start`, `page_end`, `text`
     *                      and `length`).
     *
     * @throws \JsonException When `tags`, `metadata` or the chunk uid list
     *                        cannot be JSON-encoded.
     */
    public function saveImport(array $import): void
    {
        Db::transaction(function () use ($import): void {
            $archive = $import['archive'];
            $chunks = $import['chunks'];
            $archiveUid = $archive['archive_uid'];

            Db::table('archives')->updateOrInsert(
                ['archive_uid' => $archiveUid],
                [
                    'title' => $archive['title'] ?? null,
                    'summary' => $archive['summary'] ?? null,
                    'year' => $archive['year'] ?? null,
                    'author' => $archive['author'] ?? null,
                    'source' => $archive['source'] ?? null,
                    'series' => $archive['series'] ?? null,
                    'tags' => $this->encodeJson($archive['tags'] ?? []),
                    'metadata' => $this->encodeJson($archive['metadata'] ?? []),
                    'content' => $archive['content'] ?? null,
                    'raw' => $archive['raw'] ?? null,
                    // The archive row keeps a denormalised list of its chunk uids.
                    'chunks' => $this->encodeJson(array_column($chunks, 'chunk_uid')),
                ],
            );

            // Re-import replaces the chunk set wholesale.
            Db::table('chunks')->where('archive_uid', $archiveUid)->delete();

            foreach ($chunks as $chunk) {
                Db::table('chunks')->insert([
                    'chunk_uid' => $chunk['chunk_uid'],
                    'archive_uid' => $archiveUid,
                    'chunk_index' => $chunk['chunk_index'],
                    'page_start' => $chunk['page_start'],
                    'page_end' => $chunk['page_end'],
                    'text' => $chunk['text'],
                    'length' => $chunk['length'],
                ] + self::FRESH_CHUNK_STATE);
            }
        });
    }

    /**
     * Loads one archive by uid.
     *
     * @return array|null The archive in the shape produced by
     *                    archiveToArray(), or null when no row matches.
     */
    public function findArchive(string $archiveUid): ?array
    {
        $archive = Db::table('archives')->where('archive_uid', $archiveUid)->first();

        return $archive ? $this->archiveToArray($archive) : null;
    }

    /**
     * Concatenates the text of an archive's first chunks.
     *
     * @param int $limit Maximum number of chunks to include, taken in
     *                   `chunk_index` order.
     *
     * @return string Chunk texts joined by a blank line; empty string when
     *                the archive has no chunks.
     */
    public function findChunksText(string $archiveUid, int $limit = 20): string
    {
        $chunks = Db::table('chunks')
            ->where('archive_uid', $archiveUid)
            ->orderBy('chunk_index')
            ->limit($limit)
            ->get(['text'])
            ->all();

        return implode("\n\n", array_map(fn ($chunk): string => (string) $chunk->text, $chunks));
    }

    /**
     * Loads one chunk, joined with its archive's descriptive fields.
     *
     * @return array|null The chunk in the shape produced by
     *                    chunkRowToArray(), or null when no row matches.
     */
    public function findChunk(string $chunkUid): ?array
    {
        $row = Db::table('chunks')
            ->join('archives', 'chunks.archive_uid', '=', 'archives.archive_uid')
            ->where('chunks.chunk_uid', $chunkUid)
            ->first(self::CHUNK_WITH_ARCHIVE_COLUMNS);

        return $row ? $this->chunkRowToArray($row) : null;
    }

    /**
     * Loads every chunk of an archive, ordered by `chunk_index`.
     *
     * @return array List of chunks, each in the shape produced by
     *               chunkRowToArray(); empty when the archive has none.
     */
    public function findArchiveChunks(string $archiveUid): array
    {
        $rows = Db::table('chunks')
            ->join('archives', 'chunks.archive_uid', '=', 'archives.archive_uid')
            ->where('chunks.archive_uid', $archiveUid)
            ->orderBy('chunks.chunk_index')
            ->get(self::CHUNK_WITH_ARCHIVE_COLUMNS)
            ->all();

        return array_map(fn (object $row): array => $this->chunkRowToArray($row), $rows);
    }

    /**
     * Records AI enrichment results on an archive.
     *
     * `$aiMeta` is stored under the `ai_enrichment` key of the archive's
     * metadata, replacing any previous value there. Of `$fields`, only the
     * keys `title`, `summary`, `year`, `author`, `series` and `tags` are
     * written, and only when present (a present key with a null value is
     * written as null; null `tags` are stored as an empty list).
     *
     * @param array $fields Column values to overwrite, keyed by column name.
     * @param array $aiMeta Payload stored as `metadata.ai_enrichment`.
     *
     * @throws \JsonException When the metadata or tags cannot be JSON-encoded.
     */
    public function updateMetadata(string $archiveUid, array $fields, array $aiMeta): void
    {
        // Only the metadata column is needed here; avoid loading the
        // archive's `content` and `raw` columns just to merge one key.
        $stored = Db::table('archives')->where('archive_uid', $archiveUid)->value('metadata');

        $metadata = $this->decodeJson($stored, []);
        if (!is_array($metadata)) {
            // Stored metadata that decodes to a scalar cannot take a key.
            $metadata = [];
        }
        $metadata['ai_enrichment'] = $aiMeta;

        $updates = [
            'metadata' => $this->encodeJson($metadata),
        ];

        foreach (['title', 'summary', 'year', 'author', 'series', 'tags'] as $field) {
            if (!array_key_exists($field, $fields)) {
                continue;
            }

            $updates[$field] = $field === 'tags'
                ? $this->encodeJson($fields[$field] ?? [])
                : $fields[$field];
        }

        Db::table('archives')->where('archive_uid', $archiveUid)->update($updates);
    }

    /**
     * Tells whether an archive is still missing descriptive metadata.
     *
     * Returns true when `metadata.title_source` is `'fallback'`, or when any
     * of `title`, `year`, `author`, `tags`, `summary` is absent, null, an
     * empty array or a blank string, or when `year` is falsy or not a
     * positive integer.
     *
     * @param array $archive Archive in the shape returned by findArchive().
     */
    public function archiveNeedsMetadata(array $archive): bool
    {
        // A fallback title counts as missing even though it is non-empty.
        if (($archive['metadata']['title_source'] ?? null) === 'fallback') {
            return true;
        }

        foreach (['title', 'year', 'author', 'tags', 'summary'] as $field) {
            $value = $archive[$field] ?? null;

            if (is_array($value)) {
                if ($value === []) {
                    return true;
                }

                continue;
            }

            if ($field === 'year' && (!$value || (int) $value <= 0)) {
                return true;
            }

            if ($value === null || trim((string) $value) === '') {
                return true;
            }
        }

        return false;
    }

    /**
     * Converts an `archives` row into a plain array.
     *
     * `tags`, `metadata` and `chunks` are JSON-decoded; anything that fails
     * to decode or decodes to a falsy value becomes an empty array. Scalar
     * columns are passed through without casting.
     */
    private function archiveToArray(object $archive): array
    {
        return [
            'archive_uid' => $archive->archive_uid,
            'title' => $archive->title,
            'summary' => $archive->summary,
            'year' => $archive->year,
            'author' => $archive->author,
            'source' => $archive->source,
            'series' => $archive->series,
            'tags' => json_decode($archive->tags ?? '[]', true) ?: [],
            'metadata' => json_decode($archive->metadata ?? '{}', true) ?: [],
            'content' => $archive->content,
            'raw' => $archive->raw,
            'chunks' => json_decode($archive->chunks ?? '[]', true) ?: [],
        ];
    }

    /**
     * Converts a joined chunks/archives row into a plain array.
     *
     * Expects the columns listed in CHUNK_WITH_ARCHIVE_COLUMNS. Identifiers
     * and text are cast to string, numeric columns to int (nullable ones
     * stay null), `pages` is derived from the page span, and the archive's
     * descriptive fields are nested under `archive`.
     */
    private function chunkRowToArray(object $row): array
    {
        return [
            'chunk_uid' => (string) $row->chunk_uid,
            'archive_uid' => (string) $row->archive_uid,
            'chunk_index' => (int) $row->chunk_index,
            'page_start' => $row->page_start === null ? null : (int) $row->page_start,
            'page_end' => $row->page_end === null ? null : (int) $row->page_end,
            'pages' => $this->pages($row->page_start, $row->page_end),
            'text' => (string) $row->text,
            'length' => $row->length === null ? null : (int) $row->length,
            'embedding_status' => (int) $row->embedding_status,
            'embedding_ref' => $this->decodeJson($row->embedding_ref ?? null, null),
            'embedding_model' => $row->embedding_model,
            'embedding_error' => $row->embedding_error,
            'search_index_status' => (int) $row->search_index_status,
            'search_index_error' => $row->search_index_error,
            'archive' => [
                'archive_uid' => (string) $row->archive_uid,
                'title' => $row->title,
                'summary' => $row->summary,
                'year' => $row->year === null ? null : (int) $row->year,
                'author' => $row->author,
                'source' => $row->source,
                'series' => $row->series,
                'tags' => $this->decodeJson($row->tags ?? null, []),
                'metadata' => $this->decodeJson($row->metadata ?? null, []),
            ],
        ];
    }

    /**
     * JSON-encodes a value for storage using JSON_ENCODE_FLAGS.
     *
     * @throws \JsonException When the value cannot be encoded.
     */
    private function encodeJson(mixed $value): string
    {
        return json_encode($value, self::JSON_ENCODE_FLAGS);
    }

    /**
     * Leniently decodes a JSON column value.
     *
     * @param mixed $value    Raw column value. Arrays are returned untouched.
     * @param mixed $fallback Returned for null, non-string, blank or
     *                        undecodable input.
     *
     * @return mixed The decoded value (objects as associative arrays), or
     *               `$fallback`. A valid JSON `null` decodes to null.
     */
    private function decodeJson(mixed $value, mixed $fallback): mixed
    {
        if ($value === null) {
            return $fallback;
        }

        if (is_array($value)) {
            return $value;
        }

        if (!is_string($value) || trim($value) === '') {
            return $fallback;
        }

        $decoded = json_decode($value, true);

        // A null result is only an error when the decoder reported one.
        return $decoded === null && json_last_error() !== JSON_ERROR_NONE ? $fallback : $decoded;
    }

    /**
     * Expands a page span into the list of pages it covers.
     *
     * When both bounds are numeric the result is every integer from start
     * to end inclusive; an end below the start is clamped to the start.
     * Otherwise the result is whichever bounds are neither null nor an
     * empty string, unchanged and in start/end order.
     */
    private function pages(mixed $pageStart, mixed $pageEnd): array
    {
        if (!is_numeric($pageStart) || !is_numeric($pageEnd)) {
            return array_values(array_filter(
                [$pageStart, $pageEnd],
                static fn ($value): bool => $value !== null && $value !== '',
            ));
        }

        $start = (int) $pageStart;
        $end = (int) $pageEnd;
        if ($end < $start) {
            $end = $start;
        }

        return range($start, $end);
    }
}
|