\s*(?:#+\s*Page\s+\S+\s*)?(.*?)(?=\n---\s*\n\s*/iu', '', $content) ?? $content; $content = preg_replace('/^\s*#+\s*Page\s+\S+\s*$/imu', '', $content) ?? $content; $content = preg_replace('/^\s*---+\s*$/mu', '', $content) ?? $content; $content = html_entity_decode($content, ENT_QUOTES | ENT_HTML5, 'UTF-8'); return trim($content); }

    /**
     * Normalises horizontal whitespace inside a Markdown block.
     *
     * Runs of spaces/tabs become a single space, a line break followed by
     * indentation becomes a bare "\n", and the result is trimmed.
     */
    private function cleanMarkdownBlock(string $block): string
    {
        $collapsed = preg_replace('/[ \t]+/u', ' ', $block) ?? $block;
        $unindented = preg_replace('/\R[ \t]+/u', "\n", $collapsed) ?? $collapsed;

        return trim($unindented);
    }

    /**
     * Tells whether the block, once emphasis markers are stripped, starts with
     * the word "COMMENT" (optionally preceded by Markdown punctuation).
     */
    private function isCommentBlock(string $block): bool
    {
        return preg_match('/^\s*(?:[*#\s_~`>-])*COMMENT\b/iu', $this->plainBlock($block)) === 1;
    }

    /**
     * Tells whether the block starts with one of the record prefixes
     * NSAM / NSDM / NSDD / NSD / PD followed by a number.
     */
    private function isPolicyRecordBlock(string $block): bool
    {
        return preg_match('/^\s*(?:NSAM|NSDM|NSDD|NSD|PD)\s+\d+/iu', $this->plainBlock($block)) === 1;
    }

    /**
     * Tells whether a whole block is boilerplate: empty, a bare number of up to
     * six digits, or one of the classification banners listed below.
     */
    private function isNoiseBlock(string $block): bool
    {
        $plain = strtoupper($this->plainBlock($block));
        $plain = preg_replace('/\s+/u', ' ', $plain) ?? $plain;

        if ($plain === '') {
            return true;
        }

        if (preg_match('/^\d{1,6}$/', $plain)) {
            return true;
        }

        $bannerPatterns = [
            '/^(?:# )?UNCLASSIFIED$/',
            '/^TOP SECRET$/',
            '/^UNCLASSIFIED WITH TOP SECRET ATTACHMENTS$/',
            '/^DECLASSIFY ON: OADR$/',
            '/^\*? ?UNCLASSIFIED\*?$/',
        ];

        foreach ($bannerPatterns as $bannerPattern) {
            if (preg_match($bannerPattern, $plain)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Prepares text for embedding by dropping blank and noise lines, then
     * normalising whitespace of what remains.
     *
     * If the text cannot be split into lines, it is only whitespace-normalised.
     */
    private function cleanEmbeddingText(string $text): string
    {
        $lines = preg_split('/\R/u', $text);
        if ($lines === false) {
            return $this->cleanMarkdownBlock($text);
        }

        $kept = [];
        foreach ($lines as $line) {
            $line = trim($line);
            if ($line !== '' && !$this->isNoiseLine($line)) {
                $kept[] = $line;
            }
        }

        return $this->cleanMarkdownBlock(implode("\n", $kept));
    }

    /**
     * Tells whether a single line is boilerplate: empty, a bare number of up to
     * six digits, or one of the banner / declassification-stamp patterns below.
     */
    private function isNoiseLine(string $line): bool
    {
        $plain = strtoupper($this->plainBlock($line));
        $plain = preg_replace('/\s+/u', ' ', $plain) ?? $plain;

        if ($plain === '' || preg_match('/^\d{1,6}$/', $plain)) {
            return true;
        }

        $stampPatterns = [
            '/^(?:# )?UNCLASSIFIED$/',
            '/^TOP SECRET$/',
            '/^UNCLASSIFIED WITH TOP SECRET ATTACHMENTS$/',
            '/^DECLASSIFY ON: OADR$/',
            '/^PARTIALLY DECLASSIFIED\/RELEASED ON .+$/',
            '/^UNDER PROVISIONS OF .+$/',
            '/^BY .+ NATIONAL SECURITY COUNCIL$/',
            '/^F \d{2}-\d+$/',
        ];

        foreach ($stampPatterns as $stampPattern) {
            if (preg_match($stampPattern, $plain)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Strips the Markdown emphasis characters * _ ` ~, collapses all whitespace
     * (including line breaks) to single spaces, and trims.
     */
    private function plainBlock(string $block): string
    {
        $stripped = str_replace(['*', '_', '`', '~'], '', $block);
        $stripped = preg_replace('/\s+/u', ' ', $stripped) ?? $stripped;

        return trim($stripped);
    }

    /**
     * Picks a title for the document.
     *
     * The first Markdown heading that is neither a "Page N" marker nor a noise
     * line wins; otherwise the source's file name (without extension) is used,
     * and finally a fixed placeholder.
     */
    private function inferTitle(string $markdown, string $source): string
    {
        if (preg_match_all('/^\s*#+\s*(.+?)\s*$/imu', $markdown, $matches)) {
            foreach ($matches[1] as $heading) {
                $heading = $this->clean($heading);
                if (preg_match('/^Page\s+\S+$/iu', $heading) || $this->isNoiseLine($heading)) {
                    continue;
                }

                return $heading;
            }
        }

        if ($source !== '') {
            return pathinfo($source, PATHINFO_FILENAME) ?: $source;
        }

        return 'Untitled Markdown Import';
    }

    /**
     * Converts raw items into page blocks, skipping items whose content is empty.
     *
     * An item may be an array (reads its 'content', 'metadata' and page-number
     * keys) or any other value, which is used directly as the content.
     *
     * @return list<array<string, mixed>>
     */
    private function pageBlocksFromItems(array $payload, array $items): array
    {
        $pageBlocks = [];

        foreach ($items as $index => $item) {
            $isStructured = is_array($item);

            $content = $isStructured ? ($item['content'] ?? '') : $item;
            $pageNumber = $isStructured && $this->hasPageNumber($item) ? $this->pageNumber($item) : null;
            $metadata = $isStructured ? ($item['metadata'] ?? []) : [];

            // The block index counts only blocks actually kept so far.
            $pageBlock = $this->pageBlock($payload, $content, count($pageBlocks), $index, $pageNumber, $metadata);
            if ($pageBlock !== null) {
                $pageBlocks[] = $pageBlock;
            }
        }

        return $pageBlocks;
    }

    /**
     * Builds one page block, or returns null when the cleaned content is empty.
     *
     * The block UID is derived from the payload's source and title, the page
     * number, the source index and the cleaned content.
     *
     * @return array<string, mixed>|null
     */
    private function pageBlock(
        array $payload,
        mixed $content,
        int $blockIndex,
        int $sourceIndex,
        int|string|null $pageNumber,
        array $metadata
    ): ?array {
        $content = $this->clean((string) $content);
        if ($content === '') {
            return null;
        }

        $uidSeed = implode('|', [
            $payload['source'],
            $payload['title'],
            (string) $pageNumber,
            $sourceIndex,
            $content,
        ]);

        return [
            'block_uid' => $this->uid('block', $uidSeed),
            'index' => $blockIndex,
            'page_number' => $pageNumber,
            'content' => $content,
            'metadata' => $metadata,
        ];
    }

    /**
     * Splits text into chunks of at most $chunkSize characters.
     *
     * Semantic units are packed greedily, joined by a single space. A unit that
     * is itself longer than $chunkSize is split by hardChunk(), which is the
     * only place $chunkOverlap is applied.
     *
     * @return list<string>
     */
    private function chunkLongUnit(string $text, int $chunkSize, int $chunkOverlap): array
    {
        if (mb_strlen($text) <= $chunkSize) {
            return [$text];
        }

        $chunks = [];
        $current = '';

        foreach ($this->semanticUnits($text) as $unit) {
            if (mb_strlen($unit) > $chunkSize) {
                // Flush what has been packed so far, then hard-split the oversized unit.
                if ($current !== '') {
                    $chunks[] = $current;
                    $current = '';
                }

                array_push($chunks, ...$this->hardChunk($unit, $chunkSize, $chunkOverlap));

                continue;
            }

            $candidate = $current === '' ? $unit : $current . ' ' . $unit;
            if (mb_strlen($candidate) <= $chunkSize) {
                $current = $candidate;

                continue;
            }

            if ($current !== '') {
                $chunks[] = $current;
            }

            $current = $unit;
        }

        if ($current !== '') {
            $chunks[] = $current;
        }

        return $chunks === [] ? [$text] : $chunks;
    }

    /**
     * Splits text on self::SENTENCE_BOUNDARY_PATTERN and cleans each piece.
     *
     * Falls back to the whole text when the split fails or yields nothing.
     *
     * @return list<string>
     */
    private function semanticUnits(string $text): array
    {
        $units = preg_split(self::SENTENCE_BOUNDARY_PATTERN, $text, -1, PREG_SPLIT_NO_EMPTY);
        if ($units === false || $units === []) {
            return [$text];
        }

        $cleaned = array_map(fn (string $unit): string => $this->clean($unit), $units);

        // Drop only empty strings: a bare array_filter() would also discard a
        // unit that is exactly "0", silently losing content.
        return array_values(array_filter($cleaned, static fn (string $unit): bool => $unit !== ''));
    }

    /**
     * Splits text into fixed-size character windows; consecutive windows share
     * $chunkOverlap characters.
     *
     * The overlap is clamped to [0, $chunkSize - 1] so the window always moves
     * forward: an overlap >= $chunkSize would otherwise give a zero or negative
     * step and the loop would never terminate. A non-positive $chunkSize cannot
     * be windowed at all, so the text is returned unsplit.
     *
     * @return list<string>
     */
    private function hardChunk(string $text, int $chunkSize, int $chunkOverlap): array
    {
        $length = mb_strlen($text);
        if ($chunkSize <= 0 || $length <= $chunkSize) {
            return [$text];
        }

        $overlap = max(0, min($chunkOverlap, $chunkSize - 1));
        $step = $chunkSize - $overlap;

        $chunks = [];
        for ($start = 0; $start < $length; $start += $step) {
            $chunks[] = mb_substr($text, $start, $chunkSize);

            // Stop once this window has reached the end of the text, so no
            // trailing chunk made purely of overlap is emitted.
            if ($start + $chunkSize >= $length) {
                break;
            }
        }

        return $chunks;
    }

    /**
     * Tells whether the item carries any of the keys 'page_number', 'page' or
     * 'number' (the key may exist with a null value).
     */
    private function hasPageNumber(array $item): bool
    {
        foreach (['page_number', 'page', 'number'] as $key) {
            if (array_key_exists($key, $item)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Reads the page number from the first non-null of 'page_number', 'page',
     * 'number'. Integers are returned as-is; anything else is cast to string
     * and cleaned.
     *
     * The trailing `?? ''` covers keys that exist with a null value (which
     * hasPageNumber() accepts); without it the last lookup raises an
     * "Undefined array key" warning. The returned value is '' either way.
     */
    private function pageNumber(array $item): int|string
    {
        $pageNumber = $item['page_number'] ?? $item['page'] ?? $item['number'] ?? '';

        if (is_int($pageNumber)) {
            return $pageNumber;
        }

        return $this->clean((string) $pageNumber);
    }

    /**
     * Converts a stored page number back: '' becomes null, an all-digit string
     * becomes an int, anything else is returned unchanged.
     */
    private function restorePageNumber(string $pageNumber): int|string|null
    {
        if ($pageNumber === '') {
            return null;
        }

        if (ctype_digit($pageNumber)) {
            return (int) $pageNumber;
        }

        return $pageNumber;
    }

    /**
     * Reads an integer option from the payload, returning $default when the key
     * is missing, null or an empty string. Other values are cast with (int).
     */
    private function intOption(array $payload, string $key, int $default): int
    {
        $raw = $payload[$key] ?? null;
        if ($raw === null || $raw === '') {
            return $default;
        }

        return (int) $raw;
    }

    /**
     * Collapses runs of spaces/tabs to one space and trims the value.
     */
    private function clean(string $value): string
    {
        $collapsed = preg_replace('/[ \t]+/u', ' ', $value) ?? $value;

        return trim($collapsed);
    }

    /**
     * Cleans a string value; returns null for non-strings and for values that
     * are empty after cleaning.
     */
    private function nullableClean(mixed $value): ?string
    {
        if (!is_string($value)) {
            return null;
        }

        $cleaned = $this->clean($value);

        return $cleaned === '' ? null : $cleaned;
    }

    /**
     * Parses a tag list given either as a JSON array or as a comma-separated
     * string.
     *
     * Only empty strings are discarded, so a literal tag "0" survives (a bare
     * array_filter() would drop it). Non-scalar JSON entries (null, nested
     * arrays) are skipped rather than stringified.
     *
     * @return list<string>
     */
    private function tagsFromString(string $value): array
    {
        $value = trim($value);
        if ($value === '') {
            return [];
        }

        $isNotEmpty = static fn (string $tag): bool => $tag !== '';

        $decoded = json_decode($value, true);
        if (is_array($decoded)) {
            $tags = [];
            foreach ($decoded as $entry) {
                if (is_scalar($entry)) {
                    $tags[] = (string) $entry;
                }
            }

            return array_values(array_filter($tags, $isNotEmpty));
        }

        // NOTE(review): the character class lists the same comma twice; the
        // second one may originally have been a full-width comma - confirm.
        $parts = preg_split('/[,,]/u', $value) ?: [];

        return array_values(array_filter(array_map('trim', $parts), $isNotEmpty));
    }

    /**
     * Returns the payload's 'archive_uid' upper-cased when it is a string that
     * passes isUlid(); otherwise generates a new Ulid.
     */
    private function archiveUid(array $payload): string
    {
        $candidate = $payload['archive_uid'] ?? null;
        if (is_string($candidate) && $this->isUlid($candidate)) {
            return strtoupper($candidate);
        }

        return (string) new Ulid();
    }

    /**
     * Builds a chunk UID of the form "<archiveUid>_<chunkIndex>_<shortUid(value)>".
     */
    private function chunkUid(string $archiveUid, int $chunkIndex, string $value): string
    {
        return implode('_', [$archiveUid, $chunkIndex, $this->shortUid($value)]);
    }

    /**
     * Derives a zero-padded five-digit code from the CRC32b hash of the value.
     */
    private function shortUid(string $value): string
    {
        $checksumHex = substr(hash('crc32b', $value), 0, 8);
        $number = hexdec($checksumHex) % 100000;

        return str_pad((string) $number, 5, '0', STR_PAD_LEFT);
    }

    /**
     * Tells whether the value is 26 characters drawn from the ULID alphabet
     * used in the pattern below, compared case-insensitively.
     *
     * The D modifier makes `$` match only at the very end of the subject, so a
     * value with a trailing newline is rejected instead of slipping through.
     */
    private function isUlid(string $value): bool
    {
        return preg_match('/^[0-9A-HJKMNP-TV-Z]{26}$/D', strtoupper($value)) === 1;
    }

    /**
     * Builds a deterministic id: "<prefix>_" plus the first 24 hex characters
     * of the SHA-256 hash of the value.
     */
    private function uid(string $prefix, string $value): string
    {
        $digest = hash('sha256', $value);

        return $prefix . '_' . substr($digest, 0, 24);
    }
}