proofdb/app/service/ArticleImportService.php
2026-05-01 23:40:14 +08:00

776 lines
26 KiB
PHP
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
namespace app\service;
use RuntimeException;
use Symfony\Component\Uid\Ulid;
class ArticleImportService
{
/** Chunk length (in characters) used when the payload has no `chunk_size`. */
private const DEFAULT_CHUNK_SIZE = 800;
/** Overlap (in characters) used when the payload has no `chunk_overlap`. */
private const DEFAULT_CHUNK_OVERLAP = 120;
/** Inclusive lower bound accepted for `chunk_size`. */
private const MIN_CHUNK_SIZE = 100;
/** Inclusive upper bound accepted for `chunk_size`. */
private const MAX_CHUNK_SIZE = 4000;
/**
 * Sentence splitter: matches optional whitespace that follows sentence-ending
 * punctuation, or any run of line breaks.
 *
 * The non-ASCII members are written as escapes so they cannot be confused with
 * (or silently folded into) their ASCII look-alikes:
 * U+3002 ideographic full stop, U+FF01 fullwidth '!', U+FF1F fullwidth '?',
 * U+FF1B fullwidth ';'.
 * NOTE(review): the class previously listed ASCII `!?;` twice, which looks like
 * fullwidth characters lost in transit - confirm against the intended pattern.
 */
private const SENTENCE_BOUNDARY_PATTERN = '/(?<=[\x{3002}\x{FF01}\x{FF1F}\x{FF1B}.!?;])\s*|\R+/u';
/**
 * Captures one page of DOCMASTER markdown: group 1 is the page identifier from
 * the `<!-- DOCMASTER:PAGE n -->` marker (leading zeros skipped), group 2 is the
 * page body up to the next `---` separator + marker, or the end of input.
 */
private const MARKDOWN_PAGE_PATTERN = '/<!--\s*DOCMASTER:PAGE\s+0*([0-9A-Za-z_-]+)\s*-->\s*(?:#+\s*Page\s+\S+\s*)?(.*?)(?=\n---\s*\n\s*<!--\s*DOCMASTER:PAGE|\z)/su';
/**
 * Turns a raw import payload into an archive record, embedding chunks and a
 * per-page summary.
 *
 * The payload is normalized and given metadata fallbacks before it is
 * validated; when validation reports anything, nothing else is computed.
 *
 * @param array $payload Raw import request data.
 * @return array `['ok' => false, 'errors' => ...]` on validation failure,
 *               otherwise `['ok' => true, 'data' => ...]`.
 */
public function import(array $payload): array
{
    $prepared = $this->applyMetadataFallbacks($this->normalizePayload($payload));

    $violations = $this->validate($prepared);
    if ($violations !== []) {
        return ['ok' => false, 'errors' => $violations];
    }

    $uid = $this->archiveUid($prepared);
    $archiveRecord = $this->archive($prepared, $uid);
    $blocks = $this->pageBlocks($prepared);
    $size = $this->intOption($prepared, 'chunk_size', self::DEFAULT_CHUNK_SIZE);
    $overlap = $this->intOption($prepared, 'chunk_overlap', self::DEFAULT_CHUNK_OVERLAP);
    $chunkList = $this->chunksFromPages($uid, $blocks, $size, $overlap);
    $pageList = $this->pagesSummary($blocks, $chunkList);

    $stats = [
        'page_count' => count($pageList),
        'page_block_count' => count($blocks),
        'chunk_count' => count($chunkList),
        'chunk_size' => $size,
        'chunk_overlap' => $overlap,
    ];
    // Nothing is enqueued here; the flag only reports whether the archive still lacks metadata.
    $queue = [
        'ai_metadata_enqueued' => false,
        'needs_ai_metadata' => (new ArchiveRepository())->archiveNeedsMetadata($archiveRecord),
    ];

    return [
        'ok' => true,
        'data' => [
            'import_uid' => $uid,
            'archive' => $archiveRecord,
            'chunks' => $chunkList,
            'pages' => $pageList,
            'stats' => $stats,
            'queue' => $queue,
        ],
    ];
}
/**
 * Writes an import result to `<runtime>/proofdb/imports/<import_uid>.json`.
 *
 * @param array $import Import data containing at least `import_uid`.
 * @throws RuntimeException When `import_uid` is missing or unsafe as a file
 *                          name, or the directory/file cannot be written.
 * @throws \JsonException   When the data cannot be encoded as JSON.
 */
public function persistSnapshot(array $import): void
{
    $importUid = $import['import_uid'] ?? null;
    // The UID becomes a file name: reject anything that is not a plain token so
    // it can never contain separators or ".." and escape the snapshot directory.
    if (!is_string($importUid) || preg_match('/\A[A-Za-z0-9_-]+\z/', $importUid) !== 1) {
        throw new RuntimeException('Unable to write import snapshot: import_uid is missing or not a safe file name.');
    }

    $directory = runtime_path('proofdb/imports');
    // The trailing is_dir() covers the race where another process creates the directory first.
    if (!is_dir($directory) && !mkdir($directory, 0775, true) && !is_dir($directory)) {
        throw new RuntimeException("Unable to create directory: {$directory}");
    }

    $path = $directory . DIRECTORY_SEPARATOR . $importUid . '.json';
    $json = json_encode($import, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_THROW_ON_ERROR);
    // LOCK_EX keeps concurrent writers of the same snapshot from interleaving output.
    if (file_put_contents($path, $json, LOCK_EX) === false) {
        throw new RuntimeException("Unable to write import snapshot: {$path}");
    }
}
/**
 * Collects validation errors for an import payload.
 *
 * @param array $payload Payload to check.
 * @return array<string, list<string>> Messages keyed by field path; empty when valid.
 */
private function validate(array $payload): array
{
    $errors = [];
    if (isset($payload['file_error'])) {
        $errors['file'][] = 'uploaded file is invalid.';
    }
    foreach (['title', 'source'] as $field) {
        if (!isset($payload[$field]) || !is_string($payload[$field]) || trim($payload[$field]) === '') {
            $errors[$field][] = "{$field} is required.";
        }
    }
    if (!isset($payload['content']) && !isset($payload['paragraphs']) && !isset($payload['pages'])) {
        $errors['content'][] = 'content, file, pages, or paragraphs is required.';
    }
    if (isset($payload['paragraphs']) && !is_array($payload['paragraphs'])) {
        $errors['paragraphs'][] = 'paragraphs must be an array.';
    }
    if (isset($payload['pages']) && !is_array($payload['pages'])) {
        $errors['pages'][] = 'pages must be an array.';
    }
    if (isset($payload['metadata']) && !is_array($payload['metadata'])) {
        $errors['metadata'][] = 'metadata must be an object.';
    }

    $chunkSize = $this->intOption($payload, 'chunk_size', self::DEFAULT_CHUNK_SIZE);
    if ($chunkSize < self::MIN_CHUNK_SIZE || $chunkSize > self::MAX_CHUNK_SIZE) {
        // Built from the constants so the message cannot drift from the actual bounds.
        $errors['chunk_size'][] = sprintf(
            'chunk_size must be between %d and %d.',
            self::MIN_CHUNK_SIZE,
            self::MAX_CHUNK_SIZE
        );
    }
    $chunkOverlap = $this->intOption($payload, 'chunk_overlap', self::DEFAULT_CHUNK_OVERLAP);
    if ($chunkOverlap < 0 || $chunkOverlap >= $chunkSize) {
        $errors['chunk_overlap'][] = 'chunk_overlap must be greater than or equal to 0 and less than chunk_size.';
    }

    if (!isset($errors['paragraphs']) && isset($payload['paragraphs'])) {
        $hasContent = false;
        foreach ($payload['paragraphs'] as $index => $paragraph) {
            if (is_array($paragraph)) {
                // Block building requires array metadata; report it here instead of failing later.
                if (isset($paragraph['metadata']) && !is_array($paragraph['metadata'])) {
                    $errors["paragraphs.{$index}.metadata"][] = 'paragraph metadata must be an object.';
                }
                if ($this->pageNumberIsUnusable($paragraph)) {
                    $errors["paragraphs.{$index}.page_number"][] = 'page_number must be a string or number.';
                }
            }
            $content = is_array($paragraph) ? ($paragraph['content'] ?? '') : $paragraph;
            if (!is_string($content)) {
                $errors["paragraphs.{$index}.content"][] = 'paragraph content must be a string.';
                continue;
            }
            if (trim($content) !== '') {
                $hasContent = true;
            }
        }
        if (!$hasContent) {
            $errors['paragraphs'][] = 'paragraphs must contain at least one non-empty paragraph.';
        }
    }

    if (!isset($errors['pages']) && isset($payload['pages'])) {
        $hasContent = false;
        foreach ($payload['pages'] as $index => $page) {
            if (!is_array($page)) {
                $errors["pages.{$index}"][] = 'page must be an object.';
                continue;
            }
            if (!$this->hasPageNumber($page)) {
                $errors["pages.{$index}.page_number"][] = 'page_number is required.';
            } elseif ($this->pageNumberIsUnusable($page)) {
                $errors["pages.{$index}.page_number"][] = 'page_number must be a string or number.';
            }
            // Checked before content so a page with bad content still gets its metadata validated.
            if (isset($page['metadata']) && !is_array($page['metadata'])) {
                $errors["pages.{$index}.metadata"][] = 'page metadata must be an object.';
            }
            if (!isset($page['content']) || !is_string($page['content'])) {
                $errors["pages.{$index}.content"][] = 'page content must be a string.';
            } elseif (trim($page['content']) !== '') {
                $hasContent = true;
            }
        }
        if (!$hasContent) {
            $errors['pages'][] = 'pages must contain at least one non-empty page.';
        }
    }

    if (isset($payload['content']) && (!is_string($payload['content']) || trim($payload['content']) === '')) {
        $errors['content'][] = 'content must be a non-empty string.';
    }
    return $errors;
}

/**
 * Reports whether an item's page number (under any accepted alias) is a value
 * that cannot be turned into a page label, i.e. an array or object.
 */
private function pageNumberIsUnusable(array $item): bool
{
    $raw = $item['page_number'] ?? $item['page'] ?? $item['number'] ?? null;
    return $raw !== null && !is_scalar($raw);
}
/**
 * Builds the archive record from a validated payload.
 *
 * @param array  $payload    Payload whose `title` and `source` are non-empty strings.
 * @param string $archiveUid Identifier assigned to the archive.
 * @return array Archive fields; optional text fields are null when absent or blank.
 */
private function archive(array $payload, string $archiveUid): array
{
    $title = $this->clean($payload['title']);
    $source = $this->clean($payload['source']);
    // A non-numeric year ("unknown", an array, ...) means "no year"; casting it
    // blindly would store a bogus 0 or 1.
    $rawYear = $payload['year'] ?? null;
    $year = is_numeric($rawYear) ? (int) $rawYear : null;

    return [
        'archive_uid' => $archiveUid,
        'title' => $title,
        'year' => $year,
        'author' => $this->nullableClean($payload['author'] ?? null),
        'source' => $source,
        'series' => $this->nullableClean($payload['series'] ?? null),
        'tags' => is_array($payload['tags'] ?? null) ? array_values($payload['tags']) : [],
        'summary' => $this->nullableClean($payload['summary'] ?? null),
        'metadata' => $payload['metadata'] ?? [],
        // `content` / `raw` hold locations (URL preferred over path), not the text itself.
        'content' => $this->nullableClean($payload['content_url'] ?? $payload['content_path'] ?? null),
        'raw' => $this->nullableClean($payload['raw_url'] ?? $payload['raw_path'] ?? null),
    ];
}
/**
 * Produces page blocks from whichever input form the payload carries:
 * `pages` first, then `paragraphs`, then raw `content` split on blank lines.
 *
 * @param array $payload Validated payload.
 * @return array List of page blocks.
 */
private function pageBlocks(array $payload): array
{
    if (isset($payload['pages'])) {
        return $this->pageBlocksFromPages($payload);
    }
    if (isset($payload['paragraphs'])) {
        return $this->pageBlocksFromItems($payload, $payload['paragraphs']);
    }

    $content = is_string($payload['content'] ?? null) ? $payload['content'] : '';
    $paragraphs = preg_split('/\R{2,}/u', $content);
    // preg_split() returns false on a PCRE failure (e.g. invalid UTF-8 under /u);
    // fall back to the whole text as one item rather than passing false on.
    return $this->pageBlocksFromItems($payload, $paragraphs === false ? [$content] : $paragraphs);
}
/**
 * Splits every entry of `$payload['pages']` into markdown blocks and wraps each
 * non-empty one as a page block.
 *
 * @param array $payload Validated payload with a `pages` list.
 * @return array List of page blocks in page order.
 */
private function pageBlocksFromPages(array $payload): array
{
    $collected = [];
    foreach ($payload['pages'] as $position => $pageEntry) {
        $number = $this->pageNumber($pageEntry);
        $extra = [
            'page_index' => $position,
            'page_metadata' => $pageEntry['metadata'] ?? [],
        ];
        foreach ($this->markdownBlocksFromPage($pageEntry['content']) as $offset => $text) {
            // The block index is the running count of blocks kept so far.
            $built = $this->pageBlock($payload, $text, count($collected), $offset, $number, $extra);
            if ($built === null) {
                continue;
            }
            $collected[] = $built;
        }
    }
    return $collected;
}
/**
 * Builds embedding chunks page by page: noise is removed from each block, the
 * remaining units are packed up to `$chunkSize`, and chunks are numbered from 1
 * across the whole document.
 *
 * @param string $archiveUid   Archive the chunks belong to.
 * @param array  $pageBlocks   Page blocks as produced by pageBlocks().
 * @param int    $chunkSize    Maximum chunk length in characters.
 * @param int    $chunkOverlap Overlap used when a unit must be hard-split.
 * @return array List of chunk records.
 */
private function chunksFromPages(string $archiveUid, array $pageBlocks, int $chunkSize, int $chunkOverlap): array
{
    $chunks = [];
    $chunkIndex = 1;
    foreach ($this->groupBlocksByPage($pageBlocks) as $pageKey => $blocks) {
        // PHP stores numeric-string array keys as ints; cast back explicitly
        // instead of relying on implicit int-to-string argument coercion.
        $page = $this->restorePageNumber((string) $pageKey);

        $units = [];
        foreach ($blocks as $block) {
            $unit = $this->cleanEmbeddingText($block['content']);
            if ($unit === '' || $this->isNoiseBlock($unit)) {
                continue;
            }
            $units[] = $unit;
        }

        foreach ($this->packUnitsForEmbedding($units, $chunkSize, $chunkOverlap) as $text) {
            $chunks[] = $this->chunk($archiveUid, $chunkIndex, $page, $text);
            $chunkIndex++;
        }
    }
    return $chunks;
}
/**
 * Buckets page blocks by page number, keeping first-seen page order.
 *
 * @param array $pageBlocks Blocks carrying a `page_number` entry.
 * @return array Blocks keyed by page number; unpaged blocks are under ''.
 */
private function groupBlocksByPage(array $pageBlocks): array
{
    $grouped = [];
    foreach ($pageBlocks as $entry) {
        // A null page number casts to '', the bucket for blocks without a page.
        $bucket = (string) $entry['page_number'];
        $grouped[$bucket][] = $entry;
    }
    return $grouped;
}
/**
 * Summarizes each page: how many blocks and chunks it has, the total character
 * length of its blocks, and the UIDs of its chunks.
 *
 * @param array $pageBlocks Page blocks as produced by pageBlocks().
 * @param array $chunks     Chunk records as produced by chunksFromPages().
 * @return array One summary row per page group, in first-seen page order.
 */
private function pagesSummary(array $pageBlocks, array $chunks): array
{
    // Index the chunks by page once instead of rescanning the full chunk list
    // for every page. A null page is stored under ''.
    $chunksByPage = [];
    foreach ($chunks as $chunk) {
        $chunksByPage[(string) ($chunk['page_start'] ?? '')][] = $chunk;
    }

    $pages = [];
    foreach ($this->groupBlocksByPage($pageBlocks) as $pageKey => $blocks) {
        // PHP stores numeric-string array keys as ints; cast back explicitly.
        $page = $this->restorePageNumber((string) $pageKey);
        $pageChunks = $chunksByPage[(string) ($page ?? '')] ?? [];
        $contentLength = array_sum(array_map(
            static fn (array $block): int => mb_strlen($block['content']),
            $blocks
        ));
        $pages[] = [
            'page_number' => $page,
            'block_count' => count($blocks),
            'chunk_count' => count($pageChunks),
            'content_length' => $contentLength,
            'chunk_uids' => array_column($pageChunks, 'chunk_uid'),
        ];
    }
    return $pages;
}
/**
 * Greedily packs text units into chunks of at most `$chunkSize` characters,
 * joining units inside a chunk with a blank line. A unit that is itself too
 * long is split separately and never merged with its neighbours.
 *
 * @param array $units        Text units in reading order.
 * @param int   $chunkSize    Maximum chunk length in characters.
 * @param int   $chunkOverlap Overlap forwarded to the long-unit splitter.
 * @return array List of chunk texts.
 */
private function packUnitsForEmbedding(array $units, int $chunkSize, int $chunkOverlap): array
{
    $packed = [];
    $buffer = '';
    foreach ($units as $rawUnit) {
        $piece = $this->clean($rawUnit);
        if ($piece === '') {
            continue;
        }

        if (mb_strlen($piece) > $chunkSize) {
            // Flush what has been gathered so the long unit stays in order.
            if ($buffer !== '') {
                $packed[] = $buffer;
                $buffer = '';
            }
            foreach ($this->chunkLongUnit($piece, $chunkSize, $chunkOverlap) as $slice) {
                $packed[] = $slice;
            }
            continue;
        }

        $joined = $buffer === '' ? $piece : "{$buffer}\n\n{$piece}";
        if (mb_strlen($joined) <= $chunkSize) {
            $buffer = $joined;
        } else {
            if ($buffer !== '') {
                $packed[] = $buffer;
            }
            $buffer = $piece;
        }
    }

    if ($buffer !== '') {
        $packed[] = $buffer;
    }
    return $packed;
}
/**
 * Builds one chunk record. The chunk UID is derived from the archive UID, the
 * chunk index, and a fingerprint of index, page and text.
 *
 * @param string          $archiveUid Archive the chunk belongs to.
 * @param int             $chunkIndex Position of the chunk within the import.
 * @param int|string|null $pageNumber Page the chunk came from, or null.
 * @param string          $text       Chunk text.
 * @return array Chunk record; `embedding_ref` starts as null.
 */
private function chunk(string $archiveUid, int $chunkIndex, int|string|null $pageNumber, string $text): array
{
    $fingerprint = $chunkIndex . '|' . (string) ($pageNumber ?? '') . '|' . $text;

    return [
        'chunk_uid' => $this->chunkUid($archiveUid, $chunkIndex, $fingerprint),
        'chunk_index' => $chunkIndex,
        // A chunk never spans pages here, so start and end are the same page.
        'page_start' => $pageNumber,
        'page_end' => $pageNumber,
        'pages' => $pageNumber !== null ? [$pageNumber] : [],
        'text' => $text,
        'length' => mb_strlen($text),
        'embedding_ref' => null,
    ];
}
/**
 * Fills in derived payload fields: when only a `content` string is supplied it
 * is parsed into `pages`, and a missing or blank `source` becomes 'raw-markdown'.
 *
 * @param array $payload Raw payload.
 * @return array Payload with `pages` and `source` filled in where applicable.
 */
private function normalizePayload(array $payload): array
{
    $hasStructuredInput = isset($payload['pages']) || isset($payload['paragraphs']);
    if (!$hasStructuredInput && isset($payload['content']) && is_string($payload['content'])) {
        $payload['pages'] = $this->pagesFromMarkdown($payload['content']);
    }

    $source = isset($payload['source']) ? trim((string) $payload['source']) : '';
    if ($source === '') {
        $payload['source'] = 'raw-markdown';
    }

    return $payload;
}
/**
 * Applies lenient defaults before validation: infers a title from the content
 * when none is given (recording `title_source = fallback` in the metadata),
 * turns a tag string into a list, casts a numeric year to int, and replaces
 * non-array `tags` / `metadata` with empty arrays.
 *
 * @param array $payload Normalized payload.
 * @return array Payload with the fallbacks applied.
 */
private function applyMetadataFallbacks(array $payload): array
{
    $titleMissing = !isset($payload['title']) || trim((string) $payload['title']) === '';
    if ($titleMissing && isset($payload['content']) && is_string($payload['content'])) {
        $payload['title'] = $this->inferTitle($payload['content'], (string) ($payload['source'] ?? ''));
        if (!is_array($payload['metadata'] ?? null)) {
            $payload['metadata'] = [];
        }
        $payload['metadata']['title_source'] = 'fallback';
    }

    $tags = $payload['tags'] ?? null;
    if (is_string($tags)) {
        $tags = $this->tagsFromString($tags);
    }
    $payload['tags'] = is_array($tags) ? $tags : [];

    if (is_numeric($payload['year'] ?? null)) {
        $payload['year'] = (int) $payload['year'];
    }

    if (!is_array($payload['metadata'] ?? null)) {
        $payload['metadata'] = [];
    }

    return $payload;
}
/**
 * Splits markdown into pages using `<!-- DOCMASTER:PAGE n -->` markers. When no
 * marker is found the whole text becomes a single page numbered 1.
 *
 * @param string $markdown Markdown source.
 * @return array List of pages with `page_number`, `content` and `metadata`.
 */
private function pagesFromMarkdown(string $markdown): array
{
    preg_match_all(self::MARKDOWN_PAGE_PATTERN, $markdown, $found, PREG_SET_ORDER);

    if ($found !== []) {
        return array_map(
            fn (array $hit, int $position): array => [
                // All-digit identifiers become ints; anything else stays a string label.
                'page_number' => ctype_digit($hit[1]) ? (int) $hit[1] : $hit[1],
                'content' => $this->cleanMarkdownPage($hit[2]),
                'metadata' => [
                    'parser' => 'docmaster_markdown',
                    'page_index' => $position,
                ],
            ],
            $found,
            array_keys($found)
        );
    }

    return [[
        'page_number' => 1,
        'content' => $this->cleanMarkdownPage($markdown),
        'metadata' => ['parser' => 'markdown_single_page'],
    ]];
}
/**
 * Splits a page into blank-line separated blocks. A block recognised as a
 * COMMENT is appended to the preceding block when that block looks like a
 * policy record, so the two stay together.
 *
 * @param string $content Page markdown.
 * @return array List of cleaned, non-empty blocks.
 */
private function markdownBlocksFromPage(string $content): array
{
    $page = $this->cleanMarkdownPage($content);
    $pieces = preg_split('/\R{2,}/u', $page, -1, PREG_SPLIT_NO_EMPTY);
    if ($pieces === false) {
        return [$page];
    }

    $merged = [];
    foreach ($pieces as $piece) {
        $piece = $this->cleanMarkdownBlock($piece);
        if ($piece === '') {
            continue;
        }

        $previous = array_key_last($merged);
        $attach = $previous !== null
            && $this->isCommentBlock($piece)
            && $this->isPolicyRecordBlock($merged[$previous]);

        if ($attach) {
            $merged[$previous] .= "\n" . $piece;
        } else {
            $merged[] = $piece;
        }
    }
    return $merged;
}
/**
 * Strips page scaffolding from markdown - DOCMASTER page markers, `# Page n`
 * headings and `---` rule lines - then decodes HTML entities and trims.
 *
 * @param string $content Page markdown.
 * @return string Cleaned page text.
 */
private function cleanMarkdownPage(string $content): string
{
    $scaffolding = [
        '/<!--\s*DOCMASTER:PAGE\s+[^>]+-->/iu',
        '/^\s*#+\s*Page\s+\S+\s*$/imu',
        '/^\s*---+\s*$/mu',
    ];
    foreach ($scaffolding as $pattern) {
        // A failed replacement (null) leaves the text as it was for that step.
        $content = preg_replace($pattern, '', $content) ?? $content;
    }

    return trim(html_entity_decode($content, ENT_QUOTES | ENT_HTML5, 'UTF-8'));
}
/**
 * Collapses runs of spaces/tabs to one space, removes indentation that follows
 * a line break, and trims the result.
 *
 * @param string $block Markdown block.
 * @return string Whitespace-normalized block.
 */
private function cleanMarkdownBlock(string $block): string
{
    $collapsed = preg_replace('/[ \t]+/u', ' ', $block);
    if ($collapsed === null) {
        $collapsed = $block;
    }

    $dedented = preg_replace('/\R[ \t]+/u', "\n", $collapsed);

    return trim($dedented === null ? $collapsed : $dedented);
}
/**
 * Tells whether a block starts with the word COMMENT (case-insensitive),
 * optionally preceded by markdown decoration characters.
 */
private function isCommentBlock(string $block): bool
{
    $plain = $this->plainBlock($block);

    return preg_match('/^\s*(?:[*#\s_~`>-])*COMMENT\b/iu', $plain) === 1;
}
/**
 * Tells whether a block starts with a policy record designation: one of
 * NSAM, NSDM, NSDD, NSD or PD followed by a number (case-insensitive).
 */
private function isPolicyRecordBlock(string $block): bool
{
    $plain = $this->plainBlock($block);

    return preg_match('/^\s*(?:NSAM|NSDM|NSDD|NSD|PD)\s+\d+/iu', $plain) === 1;
}
/**
 * Tells whether a whole block is boilerplate that should not be embedded:
 * empty text, a bare number of up to six digits, or a classification marking.
 *
 * @param string $block Block text.
 * @return bool True when the block is noise.
 */
private function isNoiseBlock(string $block): bool
{
    $normalized = strtoupper($this->plainBlock($block));
    $normalized = preg_replace('/\s+/u', ' ', $normalized) ?? $normalized;

    if ($normalized === '' || preg_match('/^\d{1,6}$/', $normalized)) {
        return true;
    }

    $markers = [
        '/^(?:# )?UNCLASSIFIED$/',
        '/^TOP SECRET$/',
        '/^UNCLASSIFIED WITH TOP SECRET ATTACHMENTS$/',
        '/^DECLASSIFY ON: OADR$/',
        '/^\*? ?UNCLASSIFIED\*?$/',
    ];
    foreach ($markers as $marker) {
        if (preg_match($marker, $normalized)) {
            return true;
        }
    }

    return false;
}
/**
 * Drops blank and boilerplate lines from a block, then normalizes whitespace.
 *
 * @param string $text Block text.
 * @return string Text made of the remaining lines, joined with "\n".
 */
private function cleanEmbeddingText(string $text): string
{
    $rows = preg_split('/\R/u', $text);
    if ($rows === false) {
        // Splitting failed; fall back to whitespace cleanup of the raw text.
        return $this->cleanMarkdownBlock($text);
    }

    $survivors = array_filter(
        array_map('trim', $rows),
        fn (string $row): bool => $row !== '' && !$this->isNoiseLine($row)
    );

    return $this->cleanMarkdownBlock(implode("\n", $survivors));
}
/**
 * Tells whether a single line is boilerplate: empty, a bare number of up to six
 * digits, a classification marking, or a declassification/release stamp.
 *
 * @param string $line One line of text.
 * @return bool True when the line is noise.
 */
private function isNoiseLine(string $line): bool
{
    $normalized = strtoupper($this->plainBlock($line));
    $normalized = preg_replace('/\s+/u', ' ', $normalized) ?? $normalized;

    if ($normalized === '') {
        return true;
    }
    if (preg_match('/^\d{1,6}$/', $normalized)) {
        return true;
    }

    $stamps = [
        '/^(?:# )?UNCLASSIFIED$/',
        '/^TOP SECRET$/',
        '/^UNCLASSIFIED WITH TOP SECRET ATTACHMENTS$/',
        '/^DECLASSIFY ON: OADR$/',
        '/^PARTIALLY DECLASSIFIED\/RELEASED ON .+$/',
        '/^UNDER PROVISIONS OF .+$/',
        '/^BY .+ NATIONAL SECURITY COUNCIL$/',
        '/^F \d{2}-\d+$/',
    ];
    foreach ($stamps as $stamp) {
        if (preg_match($stamp, $normalized)) {
            return true;
        }
    }

    return false;
}
/**
 * Reduces a block to plain text for matching: removes the markdown emphasis
 * characters `*`, `_`, backtick and `~`, collapses all whitespace to single
 * spaces, and trims.
 */
private function plainBlock(string $block): string
{
    $stripped = str_replace(['*', '_', '`', '~'], '', $block);
    $singleSpaced = preg_replace('/\s+/u', ' ', $stripped);

    return trim($singleSpaced ?? $stripped);
}
/**
 * Picks a title for markdown that has none: the first heading that is neither a
 * `Page n` heading nor a noise line; otherwise the source's file name (or the
 * source itself); otherwise a fixed placeholder.
 *
 * @param string $markdown Markdown content.
 * @param string $source   Source label, possibly a path; may be empty.
 * @return string Inferred title.
 */
private function inferTitle(string $markdown, string $source): string
{
    $found = preg_match_all('/^\s*#+\s*(.+?)\s*$/imu', $markdown, $matches);
    if ($found) {
        foreach ($matches[1] as $candidate) {
            $candidate = $this->clean($candidate);
            $isPageHeading = (bool) preg_match('/^Page\s+\S+$/iu', $candidate);
            if (!$isPageHeading && !$this->isNoiseLine($candidate)) {
                return $candidate;
            }
        }
    }

    if ($source === '') {
        return 'Untitled Markdown Import';
    }

    return pathinfo($source, PATHINFO_FILENAME) ?: $source;
}
/**
 * Wraps loose items (plain strings, or records with `content` and optional page
 * number / metadata) as page blocks, skipping items with no text.
 *
 * @param array $payload Validated payload (supplies source and title for block UIDs).
 * @param array $items   Paragraph strings or paragraph records.
 * @return array List of page blocks.
 */
private function pageBlocksFromItems(array $payload, array $items): array
{
    $pageBlocks = [];
    $position = 0;
    foreach ($items as $index => $item) {
        $isRecord = is_array($item);

        $content = $isRecord ? ($item['content'] ?? '') : $item;
        if (!is_string($content)) {
            // Arrays/objects cannot be cast to text; treat them as empty so they are skipped.
            $content = is_scalar($content) ? (string) $content : '';
        }

        $pageNumber = $isRecord && $this->hasPageNumber($item) ? $this->pageNumber($item) : null;

        // pageBlock() requires array metadata; ignore anything else instead of raising a TypeError.
        $metadata = $isRecord && is_array($item['metadata'] ?? null) ? $item['metadata'] : [];

        // Items decoded from a JSON object have string keys, which pageBlock()'s
        // int parameter rejects; fall back to the item's position in that case.
        $sourceIndex = is_int($index) ? $index : $position;
        $position++;

        $pageBlock = $this->pageBlock($payload, $content, count($pageBlocks), $sourceIndex, $pageNumber, $metadata);
        if ($pageBlock !== null) {
            $pageBlocks[] = $pageBlock;
        }
    }
    return $pageBlocks;
}
/**
 * Builds one page block, or returns null when the content is empty after
 * cleaning. The block UID is a hash of source, title, page number, source
 * index and content.
 *
 * @param array           $payload     Payload supplying `source` and `title`.
 * @param mixed           $content     Block text (cast to string).
 * @param int             $blockIndex  Index of the block within the import.
 * @param int             $sourceIndex Index of the item within its source list.
 * @param int|string|null $pageNumber  Page the block belongs to, or null.
 * @param array           $metadata    Extra data stored on the block.
 * @return array|null Block record, or null for empty content.
 */
private function pageBlock(
    array $payload,
    mixed $content,
    int $blockIndex,
    int $sourceIndex,
    int|string|null $pageNumber,
    array $metadata
): ?array {
    $text = $this->clean((string) $content);
    if ($text === '') {
        return null;
    }

    $seed = implode('|', [
        $payload['source'],
        $payload['title'],
        (string) $pageNumber,
        $sourceIndex,
        $text,
    ]);

    return [
        'block_uid' => $this->uid('block', $seed),
        'index' => $blockIndex,
        'page_number' => $pageNumber,
        'content' => $text,
        'metadata' => $metadata,
    ];
}
/**
 * Splits a unit that exceeds `$chunkSize`: sentence-level pieces are packed
 * greedily (joined by a single space), and a piece that is still too long is
 * cut at fixed offsets.
 *
 * @param string $text         Text to split.
 * @param int    $chunkSize    Maximum chunk length in characters.
 * @param int    $chunkOverlap Overlap used for fixed-offset cuts.
 * @return array Non-empty list of chunk texts.
 */
private function chunkLongUnit(string $text, int $chunkSize, int $chunkOverlap): array
{
    if (mb_strlen($text) <= $chunkSize) {
        return [$text];
    }

    $pieces = [];
    $buffer = '';
    foreach ($this->semanticUnits($text) as $sentence) {
        if (mb_strlen($sentence) > $chunkSize) {
            // Flush first so the oversized sentence keeps its place in the sequence.
            if ($buffer !== '') {
                $pieces[] = $buffer;
                $buffer = '';
            }
            foreach ($this->hardChunk($sentence, $chunkSize, $chunkOverlap) as $slice) {
                $pieces[] = $slice;
            }
            continue;
        }

        $joined = $buffer === '' ? $sentence : "{$buffer} {$sentence}";
        if (mb_strlen($joined) <= $chunkSize) {
            $buffer = $joined;
        } else {
            if ($buffer !== '') {
                $pieces[] = $buffer;
            }
            $buffer = $sentence;
        }
    }

    if ($buffer !== '') {
        $pieces[] = $buffer;
    }

    if ($pieces === []) {
        return [$text];
    }
    return $pieces;
}
/**
 * Splits text into sentence-level units at the class's sentence boundary
 * pattern, cleaning each unit and dropping empty ones.
 *
 * @param string $text Text to split.
 * @return array List of non-empty units; `[$text]` when splitting fails.
 */
private function semanticUnits(string $text): array
{
    $units = preg_split(self::SENTENCE_BOUNDARY_PATTERN, $text, -1, PREG_SPLIT_NO_EMPTY);
    if ($units === false || $units === []) {
        return [$text];
    }

    $cleaned = [];
    foreach ($units as $unit) {
        $unit = $this->clean($unit);
        // Compare against '' explicitly: a bare array_filter() would also drop
        // the unit "0", silently losing content.
        if ($unit !== '') {
            $cleaned[] = $unit;
        }
    }
    return $cleaned;
}
/**
 * Cuts text into windows of `$chunkSize` characters, each starting
 * `$chunkSize - $chunkOverlap` characters after the previous one.
 *
 * The overlap is clamped to `0 .. $chunkSize - 1` so the window always moves
 * forward (an overlap >= size would otherwise never terminate) and never skips
 * characters (a negative overlap would otherwise leave gaps).
 *
 * @param string $text         Text to cut.
 * @param int    $chunkSize    Window length in characters.
 * @param int    $chunkOverlap Characters shared by consecutive windows.
 * @return array List of windows; `[$text]` when the text already fits or the
 *               size is not positive.
 */
private function hardChunk(string $text, int $chunkSize, int $chunkOverlap): array
{
    $length = mb_strlen($text);
    if ($length <= $chunkSize || $chunkSize < 1) {
        return [$text];
    }

    $overlap = min(max($chunkOverlap, 0), $chunkSize - 1);
    $step = $chunkSize - $overlap;

    $chunks = [];
    for ($start = 0; $start < $length; $start += $step) {
        $chunks[] = mb_substr($text, $start, $chunkSize);
        // Stop once this window reaches the end, so no redundant tail window is emitted.
        if ($start + $chunkSize >= $length) {
            break;
        }
    }
    return $chunks;
}
/**
 * Tells whether an item carries a page number under any accepted key
 * (`page_number`, `page` or `number`). The key only has to exist; its value
 * may be null.
 */
private function hasPageNumber(array $item): bool
{
    foreach (['page_number', 'page', 'number'] as $alias) {
        if (array_key_exists($alias, $item)) {
            return true;
        }
    }
    return false;
}
/**
 * Reads an item's page number from `page_number`, `page` or `number` (first
 * non-null wins).
 *
 * @param array $item Page or paragraph record.
 * @return int|string The int as given, a cleaned string for other scalars, or
 *                    '' when no usable value is present.
 */
private function pageNumber(array $item): int|string
{
    // The trailing `?? null` avoids an undefined-key warning when every alias is null or absent.
    $pageNumber = $item['page_number'] ?? $item['page'] ?? $item['number'] ?? null;
    if (is_int($pageNumber)) {
        return $pageNumber;
    }
    // Arrays/objects cannot be cast to a label (an array would become the text "Array").
    if (!is_scalar($pageNumber)) {
        return '';
    }
    return $this->clean((string) $pageNumber);
}
/**
 * Converts a page grouping key back into a page number: '' becomes null,
 * all-digit keys become ints, anything else is returned as a string.
 *
 * Ints are accepted as well because PHP stores numeric-string array keys as
 * ints, and callers pass array keys straight in.
 *
 * @param int|string $pageNumber Grouping key.
 * @return int|string|null Restored page number.
 */
private function restorePageNumber(int|string $pageNumber): int|string|null
{
    $pageNumber = (string) $pageNumber;
    if ($pageNumber === '') {
        return null;
    }
    return ctype_digit($pageNumber) ? (int) $pageNumber : $pageNumber;
}
/**
 * Reads an integer option from the payload.
 *
 * @param array  $payload Payload to read from.
 * @param string $key     Option name.
 * @param int    $default Value used when the option is absent, null or ''.
 * @return int The option cast to int, or the default.
 */
private function intOption(array $payload, string $key, int $default): int
{
    $raw = $payload[$key] ?? null;

    return ($raw === null || $raw === '') ? $default : (int) $raw;
}
/**
 * Collapses runs of spaces and tabs into a single space and trims the result.
 * Line breaks inside the value are left alone.
 */
private function clean(string $value): string
{
    $collapsed = preg_replace('/[ \t]+/u', ' ', $value);

    // preg_replace() yields null on failure; fall back to the untouched input.
    return trim($collapsed === null ? $value : $collapsed);
}
/**
 * Cleans an optional text value.
 *
 * @param mixed $value Candidate value.
 * @return string|null The cleaned string, or null when the value is not a
 *                     string or is empty after cleaning.
 */
private function nullableClean(mixed $value): ?string
{
    $cleaned = is_string($value) ? $this->clean($value) : '';

    return $cleaned !== '' ? $cleaned : null;
}
/**
 * Parses tags given as text: either a JSON array or a comma-separated list
 * (ASCII ',' or fullwidth U+FF0C). Tags are trimmed and empty ones dropped.
 *
 * NOTE(review): the separator class previously held a lone ASCII comma, which
 * looks like a fullwidth comma lost in transit - confirm U+FF0C is intended.
 *
 * @param string $value Raw tag text.
 * @return array List of non-empty tag strings.
 */
private function tagsFromString(string $value): array
{
    $value = trim($value);
    if ($value === '') {
        return [];
    }

    $decoded = json_decode($value, true);
    if (is_array($decoded)) {
        // Nested arrays cannot be cast to a tag; keep scalar entries only.
        $candidates = array_filter($decoded, 'is_scalar');
    } else {
        $candidates = preg_split('/[,\x{FF0C}]/u', $value) ?: [];
    }

    $tags = [];
    foreach ($candidates as $candidate) {
        $tag = trim((string) $candidate);
        // Compare against '' explicitly so the tag "0" is kept.
        if ($tag !== '') {
            $tags[] = $tag;
        }
    }
    return $tags;
}
/**
 * Chooses the archive UID: the payload's `archive_uid` (upper-cased) when it is
 * a string that passes the ULID check, otherwise a freshly generated ULID.
 */
private function archiveUid(array $payload): string
{
    $given = $payload['archive_uid'] ?? null;
    if (is_string($given) && $this->isUlid($given)) {
        return strtoupper($given);
    }

    return (string) new Ulid();
}
/**
 * Composes a chunk UID as `<archiveUid>_<chunkIndex>_<shortUid of value>`.
 */
private function chunkUid(string $archiveUid, int $chunkIndex, string $value): string
{
    return implode('_', [$archiveUid, $chunkIndex, $this->shortUid($value)]);
}
/**
 * Derives a five-digit, zero-padded code from the CRC-32 (crc32b) checksum of
 * the value, taken modulo 100000.
 */
private function shortUid(string $value): string
{
    // crc32b digests are always eight hex characters, so the whole digest is used.
    $checksum = hexdec(hash('crc32b', $value));

    return sprintf('%05d', $checksum % 100000);
}
/**
 * Tells whether a value is 26 characters of Crockford base32 (case-insensitive),
 * i.e. shaped like a ULID.
 *
 * The pattern is anchored with \A and \z: a plain `$` also matches just before
 * a trailing newline, which would let "<ulid>\n" through and into UIDs and
 * file names built from it.
 */
private function isUlid(string $value): bool
{
    return preg_match('/\A[0-9A-HJKMNP-TV-Z]{26}\z/', strtoupper($value)) === 1;
}
/**
 * Builds a deterministic identifier: `<prefix>_` followed by the first 24 hex
 * characters of the value's SHA-256 digest.
 */
private function uid(string $prefix, string $value): string
{
    $digest = hash('sha256', $value);

    return sprintf('%s_%s', $prefix, substr($digest, 0, 24));
}
}