776 lines
26 KiB
PHP
776 lines
26 KiB
PHP
<?php
|
||
|
||
namespace app\service;
|
||
|
||
use RuntimeException;
|
||
use Symfony\Component\Uid\Ulid;
|
||
|
||
/**
 * Turns an article payload (raw markdown, explicit pages, or paragraphs) into
 * an archive record plus page-scoped text chunks sized for embedding.
 *
 * The service itself is stateless; the only side effect is the JSON snapshot
 * written by {@see persistSnapshot()}.
 */
class ArticleImportService
{
    private const DEFAULT_CHUNK_SIZE = 800;
    private const DEFAULT_CHUNK_OVERLAP = 120;
    private const MIN_CHUNK_SIZE = 100;
    private const MAX_CHUNK_SIZE = 4000;

    /**
     * Splits after sentence-ending punctuation or on line breaks. The ideographic
     * full stop (U+3002) and the fullwidth ! ? ; forms are written as escapes so
     * the pattern survives re-encoding of this file; the ASCII forms are accepted too.
     */
    private const SENTENCE_BOUNDARY_PATTERN = '/(?<=[\x{3002}\x{FF01}\x{FF1F}\x{FF1B}.!?;])\s*|\R+/u';

    private const MARKDOWN_PAGE_PATTERN = '/<!--\s*DOCMASTER:PAGE\s+0*([0-9A-Za-z_-]+)\s*-->\s*(?:#+\s*Page\s+\S+\s*)?(.*?)(?=\n---\s*\n\s*<!--\s*DOCMASTER:PAGE|\z)/su';

    /** Keys that may carry a page number, in lookup order. */
    private const PAGE_NUMBER_KEYS = ['page_number', 'page', 'number'];

    /** Classification banners treated as noise for both blocks and single lines. */
    private const NOISE_PATTERNS = [
        '/^(?:# )?UNCLASSIFIED$/',
        '/^TOP SECRET$/',
        '/^UNCLASSIFIED WITH TOP SECRET ATTACHMENTS$/',
        '/^DECLASSIFY ON: OADR$/',
    ];

    /** Additional patterns applied only to whole blocks. */
    private const NOISE_BLOCK_PATTERNS = [
        '/^\*? ?UNCLASSIFIED\*?$/',
    ];

    /** Additional patterns applied only to single lines. */
    private const NOISE_LINE_PATTERNS = [
        '/^PARTIALLY DECLASSIFIED\/RELEASED ON .+$/',
        '/^UNDER PROVISIONS OF .+$/',
        '/^BY .+ NATIONAL SECURITY COUNCIL$/',
        '/^F \d{2}-\d+$/',
    ];

    /**
     * Validates and converts a payload into archive data, chunks and page stats.
     *
     * @param array<string, mixed> $payload
     *
     * @return array{ok: bool, errors?: array<string, list<string>>, data?: array<string, mixed>}
     */
    public function import(array $payload): array
    {
        $payload = $this->applyMetadataFallbacks($this->normalizePayload($payload));

        $errors = $this->validate($payload);
        if ($errors !== []) {
            return ['ok' => false, 'errors' => $errors];
        }

        $archiveUid = $this->archiveUid($payload);
        $archive = $this->archive($payload, $archiveUid);
        $pageBlocks = $this->pageBlocks($payload);
        $chunkSize = $this->intOption($payload, 'chunk_size', self::DEFAULT_CHUNK_SIZE);
        $chunkOverlap = $this->intOption($payload, 'chunk_overlap', self::DEFAULT_CHUNK_OVERLAP);

        $chunks = $this->chunksFromPages($archiveUid, $pageBlocks, $chunkSize, $chunkOverlap);
        $pages = $this->pagesSummary($pageBlocks, $chunks);
        $needsAiMetadata = (new ArchiveRepository())->archiveNeedsMetadata($archive);

        return [
            'ok' => true,
            'data' => [
                'import_uid' => $archiveUid,
                'archive' => $archive,
                'chunks' => $chunks,
                'pages' => $pages,
                'stats' => [
                    'page_count' => count($pages),
                    'page_block_count' => count($pageBlocks),
                    'chunk_count' => count($chunks),
                    'chunk_size' => $chunkSize,
                    'chunk_overlap' => $chunkOverlap,
                ],
                'queue' => [
                    'ai_metadata_enqueued' => false,
                    'needs_ai_metadata' => $needsAiMetadata,
                ],
            ],
        ];
    }

    /**
     * Writes the import data as pretty-printed JSON to
     * runtime_path('proofdb/imports')/<import_uid>.json.
     *
     * @param array<string, mixed> $import must contain a string `import_uid`
     *
     * @throws RuntimeException when import_uid is missing or unsafe as a file name,
     *                          or when the directory or file cannot be written
     * @throws \JsonException   when the data cannot be encoded
     */
    public function persistSnapshot(array $import): void
    {
        // The uid becomes a file name, so reject anything that could escape the
        // target directory before touching the filesystem.
        $importUid = $import['import_uid'] ?? null;
        if (!is_string($importUid) || preg_match('/\A[0-9A-Za-z_-]+\z/', $importUid) !== 1) {
            throw new RuntimeException('Import snapshot requires a valid import_uid.');
        }

        $directory = runtime_path('proofdb/imports');
        // The second is_dir() covers a concurrent process creating the directory.
        if (!is_dir($directory) && !mkdir($directory, 0775, true) && !is_dir($directory)) {
            throw new RuntimeException("Unable to create directory: {$directory}");
        }

        $path = $directory . DIRECTORY_SEPARATOR . $importUid . '.json';
        $json = json_encode(
            $import,
            JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_THROW_ON_ERROR
        );

        if (file_put_contents($path, $json) === false) {
            throw new RuntimeException("Unable to write import snapshot: {$path}");
        }
    }

    /**
     * Collects validation errors keyed by field path; an empty array means valid.
     *
     * @param array<string, mixed> $payload
     *
     * @return array<string, list<string>>
     */
    private function validate(array $payload): array
    {
        $errors = [];

        if (isset($payload['file_error'])) {
            $errors['file'][] = 'uploaded file is invalid.';
        }

        foreach (['title', 'source'] as $field) {
            $value = $payload[$field] ?? null;
            if (!is_string($value) || trim($value) === '') {
                $errors[$field][] = "{$field} is required.";
            }
        }

        if (!isset($payload['content']) && !isset($payload['paragraphs']) && !isset($payload['pages'])) {
            $errors['content'][] = 'content, file, pages, or paragraphs is required.';
        }

        if (isset($payload['paragraphs']) && !is_array($payload['paragraphs'])) {
            $errors['paragraphs'][] = 'paragraphs must be an array.';
        }

        if (isset($payload['pages']) && !is_array($payload['pages'])) {
            $errors['pages'][] = 'pages must be an array.';
        }

        if (isset($payload['metadata']) && !is_array($payload['metadata'])) {
            $errors['metadata'][] = 'metadata must be an object.';
        }

        $chunkSize = $this->intOption($payload, 'chunk_size', self::DEFAULT_CHUNK_SIZE);
        if ($chunkSize < self::MIN_CHUNK_SIZE || $chunkSize > self::MAX_CHUNK_SIZE) {
            $errors['chunk_size'][] = sprintf(
                'chunk_size must be between %d and %d.',
                self::MIN_CHUNK_SIZE,
                self::MAX_CHUNK_SIZE
            );
        }

        $chunkOverlap = $this->intOption($payload, 'chunk_overlap', self::DEFAULT_CHUNK_OVERLAP);
        if ($chunkOverlap < 0 || $chunkOverlap >= $chunkSize) {
            $errors['chunk_overlap'][] = 'chunk_overlap must be greater than or equal to 0 and less than chunk_size.';
        }

        // Item-level checks only run when the container itself had the right type,
        // so the helper keys cannot collide with keys already present.
        if (!isset($errors['paragraphs']) && isset($payload['paragraphs'])) {
            $errors += $this->paragraphErrors($payload['paragraphs']);
        }

        if (!isset($errors['pages']) && isset($payload['pages'])) {
            $errors += $this->pageErrors($payload['pages']);
        }

        if (isset($payload['content']) && (!is_string($payload['content']) || trim($payload['content']) === '')) {
            $errors['content'][] = 'content must be a non-empty string.';
        }

        return $errors;
    }

    /**
     * Item-level validation for `paragraphs` (plain strings or arrays with `content`).
     *
     * @param array<int|string, mixed> $paragraphs
     *
     * @return array<string, list<string>>
     */
    private function paragraphErrors(array $paragraphs): array
    {
        $errors = [];
        $hasContent = false;

        foreach ($paragraphs as $index => $paragraph) {
            $content = is_array($paragraph) ? ($paragraph['content'] ?? '') : $paragraph;
            if (!is_string($content)) {
                $errors["paragraphs.{$index}.content"][] = 'paragraph content must be a string.';
                continue;
            }

            if (trim($content) !== '') {
                $hasContent = true;
            }
        }

        if (!$hasContent) {
            $errors['paragraphs'][] = 'paragraphs must contain at least one non-empty paragraph.';
        }

        return $errors;
    }

    /**
     * Item-level validation for `pages`.
     *
     * @param array<int|string, mixed> $pages
     *
     * @return array<string, list<string>>
     */
    private function pageErrors(array $pages): array
    {
        $errors = [];
        $hasContent = false;

        foreach ($pages as $index => $page) {
            if (!is_array($page)) {
                $errors["pages.{$index}"][] = 'page must be an object.';
                continue;
            }

            if (!$this->hasPageNumber($page)) {
                $errors["pages.{$index}.page_number"][] = 'page_number is required.';
            }

            if (!isset($page['content']) || !is_string($page['content'])) {
                $errors["pages.{$index}.content"][] = 'page content must be a string.';
                continue;
            }

            if (trim($page['content']) !== '') {
                $hasContent = true;
            }

            if (isset($page['metadata']) && !is_array($page['metadata'])) {
                $errors["pages.{$index}.metadata"][] = 'page metadata must be an object.';
            }
        }

        if (!$hasContent) {
            $errors['pages'][] = 'pages must contain at least one non-empty page.';
        }

        return $errors;
    }

    /**
     * Builds the archive record from a validated payload.
     *
     * @param array<string, mixed> $payload
     *
     * @return array<string, mixed>
     */
    private function archive(array $payload, string $archiveUid): array
    {
        $year = $payload['year'] ?? null;

        return [
            'archive_uid' => $archiveUid,
            'title' => $this->clean($payload['title']),
            // A non-numeric year is reported as unknown rather than coerced to 0.
            'year' => is_numeric($year) ? (int) $year : null,
            'author' => $this->nullableClean($payload['author'] ?? null),
            'source' => $this->clean($payload['source']),
            'series' => $this->nullableClean($payload['series'] ?? null),
            'tags' => is_array($payload['tags'] ?? null) ? array_values($payload['tags']) : [],
            'summary' => $this->nullableClean($payload['summary'] ?? null),
            'metadata' => $payload['metadata'] ?? [],
            'content' => $this->nullableClean($payload['content_url'] ?? $payload['content_path'] ?? null),
            'raw' => $this->nullableClean($payload['raw_url'] ?? $payload['raw_path'] ?? null),
        ];
    }

    /**
     * Picks the block source by precedence: pages, then paragraphs, then raw content
     * split on blank lines.
     *
     * @param array<string, mixed> $payload
     *
     * @return list<array<string, mixed>>
     */
    private function pageBlocks(array $payload): array
    {
        if (isset($payload['pages'])) {
            return $this->pageBlocksFromPages($payload);
        }

        if (isset($payload['paragraphs'])) {
            return $this->pageBlocksFromItems($payload, $payload['paragraphs']);
        }

        $content = is_string($payload['content'] ?? null) ? $payload['content'] : '';
        $items = preg_split('/\R{2,}/u', $content);

        // preg_split() returns false on e.g. invalid UTF-8; treat that as no blocks.
        return $this->pageBlocksFromItems($payload, $items === false ? [] : $items);
    }

    /**
     * Splits every page into markdown blocks and wraps each non-empty one.
     *
     * @param array<string, mixed> $payload
     *
     * @return list<array<string, mixed>>
     */
    private function pageBlocksFromPages(array $payload): array
    {
        $pageBlocks = [];

        foreach ($payload['pages'] as $pageIndex => $page) {
            $pageNumber = $this->pageNumber($page);
            $metadata = [
                'page_index' => $pageIndex,
                'page_metadata' => $page['metadata'] ?? [],
            ];

            foreach ($this->markdownBlocksFromPage($page['content']) as $itemIndex => $content) {
                $pageBlock = $this->pageBlock($payload, $content, count($pageBlocks), $itemIndex, $pageNumber, $metadata);
                if ($pageBlock !== null) {
                    $pageBlocks[] = $pageBlock;
                }
            }
        }

        return $pageBlocks;
    }

    /**
     * Produces sequentially numbered chunks (starting at 1); chunks never span pages.
     *
     * @param list<array<string, mixed>> $pageBlocks
     *
     * @return list<array<string, mixed>>
     */
    private function chunksFromPages(string $archiveUid, array $pageBlocks, int $chunkSize, int $chunkOverlap): array
    {
        $chunks = [];
        $chunkIndex = 1;

        foreach ($this->groupBlocksByPage($pageBlocks) as $pageKey => $blocks) {
            $page = $this->restorePageNumber($pageKey);

            $units = [];
            foreach ($blocks as $block) {
                $unit = $this->cleanEmbeddingText($block['content']);
                if ($unit !== '' && !$this->isNoiseBlock($unit)) {
                    $units[] = $unit;
                }
            }

            foreach ($this->packUnitsForEmbedding($units, $chunkSize, $chunkOverlap) as $text) {
                $chunks[] = $this->chunk($archiveUid, $chunkIndex, $page, $text);
                $chunkIndex++;
            }
        }

        return $chunks;
    }

    /**
     * Groups blocks by normalised page key, preserving first-seen page order.
     *
     * Keys are normalised so "007" and 7 land in the same group. PHP stores
     * integer-like string keys as ints, hence the int|string key type.
     *
     * @param list<array<string, mixed>> $pageBlocks
     *
     * @return array<int|string, list<array<string, mixed>>>
     */
    private function groupBlocksByPage(array $pageBlocks): array
    {
        $pages = [];
        foreach ($pageBlocks as $block) {
            $pages[$this->pageKey($block['page_number'])][] = $block;
        }

        return $pages;
    }

    /**
     * Canonical grouping key for a page number; '' stands for "no page".
     */
    private function pageKey(int|string|null $pageNumber): string
    {
        if ($pageNumber === null) {
            return '';
        }

        $restored = $this->restorePageNumber($pageNumber);

        return $restored === null ? '' : (string) $restored;
    }

    /**
     * Per-page statistics: block count, chunk count, content length and chunk uids.
     *
     * @param list<array<string, mixed>> $pageBlocks
     * @param list<array<string, mixed>> $chunks
     *
     * @return list<array<string, mixed>>
     */
    private function pagesSummary(array $pageBlocks, array $chunks): array
    {
        // Index chunks once instead of re-filtering the full list for every page.
        $chunksByPage = [];
        foreach ($chunks as $chunk) {
            $chunksByPage[$this->pageKey($chunk['page_start'])][] = $chunk;
        }

        $pages = [];
        foreach ($this->groupBlocksByPage($pageBlocks) as $pageKey => $blocks) {
            $pageChunks = $chunksByPage[$pageKey] ?? [];
            $contentLength = 0;
            foreach ($blocks as $block) {
                $contentLength += mb_strlen($block['content']);
            }

            $pages[] = [
                'page_number' => $this->restorePageNumber($pageKey),
                'block_count' => count($blocks),
                'chunk_count' => count($pageChunks),
                'content_length' => $contentLength,
                'chunk_uids' => array_column($pageChunks, 'chunk_uid'),
            ];
        }

        return $pages;
    }

    /**
     * Packs cleaned units into chunks of at most $chunkSize characters, joining
     * units with a blank line; oversized units are split by chunkLongUnit().
     *
     * @param list<string> $units
     *
     * @return list<string>
     */
    private function packUnitsForEmbedding(array $units, int $chunkSize, int $chunkOverlap): array
    {
        return $this->packUnits(
            $units,
            $chunkSize,
            "\n\n",
            fn (string $unit): array => $this->chunkLongUnit($unit, $chunkSize, $chunkOverlap)
        );
    }

    /**
     * Greedy packer shared by paragraph-level and sentence-level chunking.
     *
     * @param iterable<string>               $units
     * @param string                         $glue           separator placed between packed units
     * @param callable(string): list<string> $splitOversized splits a unit longer than $chunkSize
     *
     * @return list<string>
     */
    private function packUnits(iterable $units, int $chunkSize, string $glue, callable $splitOversized): array
    {
        $chunks = [];
        $current = '';

        foreach ($units as $unit) {
            $unit = $this->clean($unit);
            if ($unit === '') {
                continue;
            }

            if (mb_strlen($unit) > $chunkSize) {
                // Flush what has been packed so far to keep the original order.
                if ($current !== '') {
                    $chunks[] = $current;
                    $current = '';
                }

                array_push($chunks, ...$splitOversized($unit));
                continue;
            }

            $candidate = $current === '' ? $unit : $current . $glue . $unit;
            if (mb_strlen($candidate) <= $chunkSize) {
                $current = $candidate;
                continue;
            }

            if ($current !== '') {
                $chunks[] = $current;
            }
            $current = $unit;
        }

        if ($current !== '') {
            $chunks[] = $current;
        }

        return $chunks;
    }

    /**
     * Builds one chunk record; page_start and page_end are always the same page.
     *
     * @return array<string, mixed>
     */
    private function chunk(string $archiveUid, int $chunkIndex, int|string|null $pageNumber, string $text): array
    {
        $fingerprint = implode('|', [$chunkIndex, (string) ($pageNumber ?? ''), $text]);

        return [
            'chunk_uid' => $this->chunkUid($archiveUid, $chunkIndex, $fingerprint),
            'chunk_index' => $chunkIndex,
            'page_start' => $pageNumber,
            'page_end' => $pageNumber,
            'pages' => $pageNumber === null ? [] : [$pageNumber],
            'text' => $text,
            'length' => mb_strlen($text),
            'embedding_ref' => null,
        ];
    }

    /**
     * Derives `pages` from raw markdown content (when neither pages nor paragraphs
     * were supplied) and defaults a blank source to 'raw-markdown'.
     *
     * @param array<string, mixed> $payload
     *
     * @return array<string, mixed>
     */
    private function normalizePayload(array $payload): array
    {
        $content = $payload['content'] ?? null;
        if (is_string($content) && !isset($payload['pages']) && !isset($payload['paragraphs'])) {
            $payload['pages'] = $this->pagesFromMarkdown($content);
        }

        if ($this->isBlank($payload['source'] ?? null)) {
            $payload['source'] = 'raw-markdown';
        }

        return $payload;
    }

    /**
     * Fills in a title inferred from the content, and coerces tags, year and
     * metadata into their expected types.
     *
     * Note: non-array metadata is replaced by [] here, before validation runs.
     *
     * @param array<string, mixed> $payload
     *
     * @return array<string, mixed>
     */
    private function applyMetadataFallbacks(array $payload): array
    {
        $payload['metadata'] = is_array($payload['metadata'] ?? null) ? $payload['metadata'] : [];

        $content = $payload['content'] ?? null;
        if ($this->isBlank($payload['title'] ?? null) && is_string($content)) {
            $source = $payload['source'] ?? '';
            $payload['title'] = $this->inferTitle($content, is_scalar($source) ? (string) $source : '');
            $payload['metadata']['title_source'] = 'fallback';
        }

        if (is_string($payload['tags'] ?? null)) {
            $payload['tags'] = $this->tagsFromString($payload['tags']);
        }
        $payload['tags'] = is_array($payload['tags'] ?? null) ? $payload['tags'] : [];

        if (isset($payload['year']) && is_numeric($payload['year'])) {
            $payload['year'] = (int) $payload['year'];
        }

        return $payload;
    }

    /**
     * True for null and for values whose string form is empty after trimming.
     * Arrays and non-stringable objects count as present, so validation reports them.
     */
    private function isBlank(mixed $value): bool
    {
        if ($value === null) {
            return true;
        }

        if (is_scalar($value) || $value instanceof \Stringable) {
            return trim((string) $value) === '';
        }

        return false;
    }

    /**
     * Splits markdown on DOCMASTER:PAGE markers; without markers the whole
     * document becomes page 1.
     *
     * @return list<array{page_number: int|string, content: string, metadata: array<string, mixed>}>
     */
    private function pagesFromMarkdown(string $markdown): array
    {
        preg_match_all(self::MARKDOWN_PAGE_PATTERN, $markdown, $matches, PREG_SET_ORDER);

        if ($matches === []) {
            return [[
                'page_number' => 1,
                'content' => $this->cleanMarkdownPage($markdown),
                'metadata' => ['parser' => 'markdown_single_page'],
            ]];
        }

        $pages = [];
        foreach ($matches as $index => $match) {
            $pages[] = [
                // Purely numeric markers become ints; anything else stays a label.
                'page_number' => ctype_digit($match[1]) ? (int) $match[1] : $match[1],
                'content' => $this->cleanMarkdownPage($match[2]),
                'metadata' => [
                    'parser' => 'docmaster_markdown',
                    'page_index' => $index,
                ],
            ];
        }

        return $pages;
    }

    /**
     * Splits a page on blank lines; a COMMENT block directly following a policy
     * record (NSAM/NSDM/NSDD/NSD/PD + number) is appended to that record.
     *
     * @return list<string>
     */
    private function markdownBlocksFromPage(string $content): array
    {
        $content = $this->cleanMarkdownPage($content);
        $blocks = preg_split('/\R{2,}/u', $content, -1, PREG_SPLIT_NO_EMPTY);
        if ($blocks === false) {
            return [$content];
        }

        $records = [];
        foreach ($blocks as $block) {
            $block = $this->cleanMarkdownBlock($block);
            if ($block === '') {
                continue;
            }

            $lastIndex = count($records) - 1;
            $attachToPrevious = $lastIndex >= 0
                && $this->isCommentBlock($block)
                && $this->isPolicyRecordBlock($records[$lastIndex]);

            if ($attachToPrevious) {
                $records[$lastIndex] .= "\n" . $block;
                continue;
            }

            $records[] = $block;
        }

        return $records;
    }

    /**
     * Strips page markers, "# Page N" headings and horizontal rules, then decodes
     * HTML entities.
     */
    private function cleanMarkdownPage(string $content): string
    {
        $removals = [
            '/<!--\s*DOCMASTER:PAGE\s+[^>]+-->/iu',
            '/^\s*#+\s*Page\s+\S+\s*$/imu',
            '/^\s*---+\s*$/mu',
        ];

        foreach ($removals as $pattern) {
            // preg_replace() returns null on failure; keep the text as-is then.
            $content = preg_replace($pattern, '', $content) ?? $content;
        }

        return trim(html_entity_decode($content, ENT_QUOTES | ENT_HTML5, 'UTF-8'));
    }

    /**
     * Collapses runs of spaces/tabs and removes indentation after line breaks.
     */
    private function cleanMarkdownBlock(string $block): string
    {
        $block = preg_replace('/[ \t]+/u', ' ', $block) ?? $block;
        $block = preg_replace('/\R[ \t]+/u', "\n", $block) ?? $block;

        return trim($block);
    }

    /**
     * Whether the block starts with the word COMMENT (markdown decoration ignored).
     */
    private function isCommentBlock(string $block): bool
    {
        return preg_match('/^\s*(?:[*#\s_~`>-])*COMMENT\b/iu', $this->plainBlock($block)) === 1;
    }

    /**
     * Whether the block starts with a policy record identifier such as "NSDD 32".
     */
    private function isPolicyRecordBlock(string $block): bool
    {
        return preg_match('/^\s*(?:NSAM|NSDM|NSDD|NSD|PD)\s+\d+/iu', $this->plainBlock($block)) === 1;
    }

    /**
     * Whether a whole block is boilerplate (empty, bare page number, banner).
     */
    private function isNoiseBlock(string $block): bool
    {
        return $this->isNoise($block, self::NOISE_BLOCK_PATTERNS);
    }

    /**
     * Drops noise lines from a block and normalises its whitespace.
     */
    private function cleanEmbeddingText(string $text): string
    {
        $lines = preg_split('/\R/u', $text);
        if ($lines === false) {
            return $this->cleanMarkdownBlock($text);
        }

        $kept = [];
        foreach ($lines as $line) {
            $line = trim($line);
            if ($line !== '' && !$this->isNoiseLine($line)) {
                $kept[] = $line;
            }
        }

        return $this->cleanMarkdownBlock(implode("\n", $kept));
    }

    /**
     * Whether a single line is boilerplate (empty, bare page number, banner or
     * declassification stamp).
     */
    private function isNoiseLine(string $line): bool
    {
        return $this->isNoise($line, self::NOISE_LINE_PATTERNS);
    }

    /**
     * Shared noise test: upper-cases the decoration-free text and matches it
     * against the common patterns plus the caller-specific ones.
     *
     * @param list<string> $extraPatterns
     */
    private function isNoise(string $text, array $extraPatterns): bool
    {
        // plainBlock() already collapses whitespace and trims.
        $plain = mb_strtoupper($this->plainBlock($text), 'UTF-8');

        if ($plain === '' || preg_match('/^\d{1,6}$/', $plain) === 1) {
            return true;
        }

        foreach ([...self::NOISE_PATTERNS, ...$extraPatterns] as $pattern) {
            if (preg_match($pattern, $plain) === 1) {
                return true;
            }
        }

        return false;
    }

    /**
     * Removes markdown emphasis characters and collapses all whitespace.
     */
    private function plainBlock(string $block): string
    {
        $block = str_replace(['*', '_', '`', '~'], '', $block);
        $block = preg_replace('/\s+/u', ' ', $block) ?? $block;

        return trim($block);
    }

    /**
     * Picks the first markdown heading that is neither a "Page N" marker nor
     * noise; falls back to the source file name, then to a fixed placeholder.
     */
    private function inferTitle(string $markdown, string $source): string
    {
        if (preg_match_all('/^\s*#+\s*(.+?)\s*$/imu', $markdown, $matches)) {
            foreach ($matches[1] as $heading) {
                $heading = $this->clean($heading);
                $isPageMarker = preg_match('/^Page\s+\S+$/iu', $heading) === 1;
                if (!$isPageMarker && !$this->isNoiseLine($heading)) {
                    return $heading;
                }
            }
        }

        if ($source !== '') {
            return pathinfo($source, PATHINFO_FILENAME) ?: $source;
        }

        return 'Untitled Markdown Import';
    }

    /**
     * Wraps paragraph-like items (strings or arrays with `content`) as blocks.
     *
     * @param array<string, mixed>     $payload
     * @param array<int|string, mixed> $items
     *
     * @return list<array<string, mixed>>
     */
    private function pageBlocksFromItems(array $payload, array $items): array
    {
        $pageBlocks = [];
        $position = 0;

        foreach ($items as $key => $item) {
            // Integer keys are kept so block uids stay stable; string keys (JSON
            // objects) fall back to the item's position instead of a TypeError.
            $sourceIndex = is_int($key) ? $key : $position;
            $position++;

            $isStructured = is_array($item);
            $content = $isStructured ? ($item['content'] ?? '') : $item;
            $pageNumber = $isStructured && $this->hasPageNumber($item) ? $this->pageNumber($item) : null;
            $metadata = $isStructured ? ($item['metadata'] ?? []) : [];

            $pageBlock = $this->pageBlock($payload, $content, count($pageBlocks), $sourceIndex, $pageNumber, $metadata);
            if ($pageBlock !== null) {
                $pageBlocks[] = $pageBlock;
            }
        }

        return $pageBlocks;
    }

    /**
     * Builds one block record, or null when the cleaned content is empty.
     *
     * @param array<string, mixed> $payload  must contain string `source` and `title`
     * @param array<string, mixed> $metadata
     *
     * @return array<string, mixed>|null
     */
    private function pageBlock(
        array $payload,
        mixed $content,
        int $blockIndex,
        int $sourceIndex,
        int|string|null $pageNumber,
        array $metadata
    ): ?array {
        // Non-scalar content cannot be rendered as text; treat it as empty.
        $content = $this->clean(is_scalar($content) ? (string) $content : '');
        if ($content === '') {
            return null;
        }

        $fingerprint = implode('|', [
            $payload['source'],
            $payload['title'],
            (string) $pageNumber,
            $sourceIndex,
            $content,
        ]);

        return [
            'block_uid' => $this->uid('block', $fingerprint),
            'index' => $blockIndex,
            'page_number' => $pageNumber,
            'content' => $content,
            'metadata' => $metadata,
        ];
    }

    /**
     * Splits an oversized unit at sentence boundaries, hard-cutting any single
     * sentence that is still too long.
     *
     * @return list<string>
     */
    private function chunkLongUnit(string $text, int $chunkSize, int $chunkOverlap): array
    {
        if (mb_strlen($text) <= $chunkSize) {
            return [$text];
        }

        $chunks = $this->packUnits(
            $this->semanticUnits($text),
            $chunkSize,
            ' ',
            fn (string $unit): array => $this->hardChunk($unit, $chunkSize, $chunkOverlap)
        );

        return $chunks === [] ? [$text] : $chunks;
    }

    /**
     * Splits text into sentence-like units; empty units are dropped.
     *
     * @return list<string>
     */
    private function semanticUnits(string $text): array
    {
        $units = preg_split(self::SENTENCE_BOUNDARY_PATTERN, $text, -1, PREG_SPLIT_NO_EMPTY);
        if ($units === false || $units === []) {
            return [$text];
        }

        $cleaned = [];
        foreach ($units as $unit) {
            $unit = $this->clean($unit);
            // Compare against '' explicitly: a bare truthiness filter would drop "0".
            if ($unit !== '') {
                $cleaned[] = $unit;
            }
        }

        return $cleaned;
    }

    /**
     * Cuts text into fixed-size windows that overlap by $chunkOverlap characters.
     *
     * @return list<string>
     */
    private function hardChunk(string $text, int $chunkSize, int $chunkOverlap): array
    {
        $length = mb_strlen($text);
        if ($chunkSize < 1 || $length <= $chunkSize) {
            return [$text];
        }

        // Always advance by at least one character so an overlap >= chunk size
        // can never cause an endless loop.
        $step = max(1, $chunkSize - $chunkOverlap);

        $chunks = [];
        for ($start = 0; $start < $length; $start += $step) {
            $chunks[] = mb_substr($text, $start, $chunkSize);
            if ($start + $chunkSize >= $length) {
                break;
            }
        }

        return $chunks;
    }

    /**
     * Whether the item declares a page number under any of the accepted keys
     * (the key only has to exist; its value may be null).
     */
    private function hasPageNumber(array $item): bool
    {
        foreach (self::PAGE_NUMBER_KEYS as $key) {
            if (array_key_exists($key, $item)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Returns the first non-null page number among the accepted keys; ints are
     * kept, other scalars become cleaned strings, anything else yields ''.
     */
    private function pageNumber(array $item): int|string
    {
        $raw = null;
        foreach (self::PAGE_NUMBER_KEYS as $key) {
            if (($item[$key] ?? null) !== null) {
                $raw = $item[$key];
                break;
            }
        }

        if (is_int($raw)) {
            return $raw;
        }

        return is_scalar($raw) ? $this->clean((string) $raw) : '';
    }

    /**
     * Converts a grouping key back to a page number: '' means no page, digit-only
     * keys become ints, everything else stays a string label.
     *
     * Accepts ints because PHP turns integer-like array keys into ints.
     */
    private function restorePageNumber(int|string $pageNumber): int|string|null
    {
        $pageNumber = (string) $pageNumber;
        if ($pageNumber === '') {
            return null;
        }

        return ctype_digit($pageNumber) ? (int) $pageNumber : $pageNumber;
    }

    /**
     * Reads an integer option, using $default when it is missing, null or ''.
     *
     * @param array<string, mixed> $payload
     */
    private function intOption(array $payload, string $key, int $default): int
    {
        $value = $payload[$key] ?? null;

        return $value === null || $value === '' ? $default : (int) $value;
    }

    /**
     * Collapses runs of spaces/tabs to a single space and trims the result.
     */
    private function clean(string $value): string
    {
        return trim(preg_replace('/[ \t]+/u', ' ', $value) ?? $value);
    }

    /**
     * clean() for optional fields: non-strings and empty results become null.
     */
    private function nullableClean(mixed $value): ?string
    {
        if (!is_string($value)) {
            return null;
        }

        $value = $this->clean($value);

        return $value === '' ? null : $value;
    }

    /**
     * Parses tags from a JSON array string or a comma-separated list (ASCII or
     * fullwidth comma, U+FF0C).
     *
     * @return list<string>
     */
    private function tagsFromString(string $value): array
    {
        $value = trim($value);
        if ($value === '') {
            return [];
        }

        $decoded = json_decode($value, true);
        if (is_array($decoded)) {
            // Nested structures cannot be tags; only scalars are stringified.
            $candidates = array_map(strval(...), array_filter($decoded, is_scalar(...)));
        } else {
            $candidates = array_map(trim(...), preg_split('/[,\x{FF0C}]/u', $value) ?: []);
        }

        // Explicit '' comparison keeps a legitimate "0" tag.
        return array_values(array_filter($candidates, static fn (string $tag): bool => $tag !== ''));
    }

    /**
     * Reuses a caller-supplied ULID (upper-cased) or generates a new one.
     *
     * @param array<string, mixed> $payload
     */
    private function archiveUid(array $payload): string
    {
        $supplied = $payload['archive_uid'] ?? null;
        if (is_string($supplied) && $this->isUlid($supplied)) {
            return strtoupper($supplied);
        }

        return (string) new Ulid();
    }

    /**
     * Chunk uid format: <archiveUid>_<chunkIndex>_<5-digit checksum of $value>.
     */
    private function chunkUid(string $archiveUid, int $chunkIndex, string $value): string
    {
        return $archiveUid . '_' . $chunkIndex . '_' . $this->shortUid($value);
    }

    /**
     * Five-digit, zero-padded checksum derived from the CRC32b of $value.
     */
    private function shortUid(string $value): string
    {
        $number = hexdec(substr(hash('crc32b', $value), 0, 8)) % 100000;

        return str_pad((string) $number, 5, '0', STR_PAD_LEFT);
    }

    /**
     * Whether the value is 26 Crockford base32 characters (case-insensitive).
     */
    private function isUlid(string $value): bool
    {
        // \A...\z instead of ^...$ so a trailing newline is not accepted.
        return preg_match('/\A[0-9A-HJKMNP-TV-Z]{26}\z/', strtoupper($value)) === 1;
    }

    /**
     * Deterministic id: "<prefix>_" plus the first 24 hex chars of sha256($value).
     */
    private function uid(string $prefix, string $value): string
    {
        return $prefix . '_' . substr(hash('sha256', $value), 0, 24);
    }
}
|