暂存
This commit is contained in:
parent
093ce2e8bc
commit
549b706fcc
5
.env
5
.env
@ -6,4 +6,7 @@ LLM_CHAT_TEMPERATURE=0.2
|
||||
LLM_METADATA_ENABLED="true"
|
||||
LLM_METADATA_MODEL="glm-4.7-flash"
|
||||
LLM_METADATA_MAX_TOKENS=2480
|
||||
LLM_METADATA_TEMPERATURE=0.1
|
||||
LLM_METADATA_TEMPERATURE=0.1
|
||||
OPENSEARCH_HOST="http://localhost:9200"
|
||||
OPENSEARCH_USERNAME="admin"
|
||||
OPENSEARCH_PASSWORD="proofdb"
|
||||
18
Dockerfile
18
Dockerfile
@ -1,18 +0,0 @@
|
||||
FROM php:8.3.22-cli-alpine
|
||||
|
||||
RUN mv "$PHP_INI_DIR/php.ini-production" "$PHP_INI_DIR/php.ini"
|
||||
|
||||
RUN sed -i 's/dl-cdn.alpinelinux.org/mirrors.aliyun.com/g' /etc/apk/repositories \
|
||||
&& apk update --no-cache \
|
||||
&& docker-php-source extract
|
||||
|
||||
# install extensions
|
||||
RUN docker-php-ext-install pdo pdo_mysql -j$(nproc) pcntl
|
||||
|
||||
# enable opcache and pcntl
|
||||
RUN docker-php-ext-enable opcache pcntl
|
||||
# Remove the extracted PHP sources, then clear the apk cache. The `&&` is
# required: without it the `rm` words are passed as extra arguments to
# `docker-php-source delete` and the cache is never removed.
RUN docker-php-source delete \
    && rm -rf /var/cache/apk/*
|
||||
|
||||
RUN mkdir -p /app
|
||||
WORKDIR /app
|
||||
@ -14,7 +14,7 @@
|
||||
runtime/proofdb/imports/{import_uid}.json
|
||||
```
|
||||
|
||||
后续接入 MySQL、OpenSearch、Vector DB 时,会沿用当前返回结构中的 UID。
|
||||
后续接入 PostgreSQL、OpenSearch、Vector DB 时,会沿用当前返回结构中的 UID。
|
||||
|
||||
## 请求方式
|
||||
|
||||
|
||||
395
apidoc/searchapi.md
Normal file
395
apidoc/searchapi.md
Normal file
@ -0,0 +1,395 @@
|
||||
# 搜索 API
|
||||
|
||||
## 接口说明
|
||||
|
||||
Proof DB 的搜索接口基于 OpenSearch `proofdb_chunks` 索引。当前版本已实现全文搜索、向量搜索和混合搜索,检索单位是 chunk,返回结果包含档案元数据、页码范围和 chunk 文本,便于后续 evidence reconstruction。
|
||||
|
||||
OpenSearch 中每个 chunk 文档同时包含:
|
||||
|
||||
- `text` 等全文字段,用于 BM25 检索。
|
||||
- `embedding` 2048 维向量字段,用于 vector / hybrid 检索。
|
||||
|
||||
## 全文搜索
|
||||
|
||||
```http
|
||||
POST /api/search/fulltext
|
||||
```
|
||||
|
||||
### 请求格式
|
||||
|
||||
`Content-Type: application/json`
|
||||
|
||||
| 字段 | 类型 | 必填 | 说明 |
|
||||
| --- | --- | --- | --- |
|
||||
| `query` | string | 是 | 搜索关键词或短语 |
|
||||
| `limit` | int | 否 | 返回条数,默认 `10`,最大 `50` |
|
||||
| `filters` | object | 否 | 过滤条件 |
|
||||
| `filters.archive_uid` | string | 否 | 只搜索某个 archive |
|
||||
| `filters.chunk_uid` | string | 否 | 只搜索某个 chunk |
|
||||
| `filters.source` | string | 否 | 精确匹配 source |
|
||||
| `filters.author` | string | 否 | 精确匹配 author |
|
||||
| `filters.series` | string | 否 | 精确匹配 series |
|
||||
| `filters.year` | int | 否 | 精确匹配年份 |
|
||||
| `filters.tags` | string\|array | 否 | 匹配一个或多个 tag |
|
||||
|
||||
### 请求示例
|
||||
|
||||
```bash
|
||||
curl -X POST http://127.0.0.1:8787/api/search/fulltext \
|
||||
-H 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"query": "policy documents",
|
||||
"limit": 5
|
||||
}'
|
||||
```
|
||||
|
||||
带过滤条件:
|
||||
|
||||
```bash
|
||||
curl -X POST http://127.0.0.1:8787/api/search/fulltext \
|
||||
-H 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"query": "Iraq Kuwait",
|
||||
"limit": 10,
|
||||
"filters": {
|
||||
"year": 1992,
|
||||
"tags": ["NSD 76"]
|
||||
}
|
||||
}'
|
||||
```
|
||||
|
||||
### 成功响应
|
||||
|
||||
状态码:
|
||||
|
||||
```http
|
||||
200 OK
|
||||
```
|
||||
|
||||
响应示例:
|
||||
|
||||
```json
|
||||
{
|
||||
"code": 0,
|
||||
"message": "Full-text search completed.",
|
||||
"data": {
|
||||
"mode": "fulltext",
|
||||
"query": "policy documents",
|
||||
"limit": 5,
|
||||
"filters": [],
|
||||
"total": 1,
|
||||
"hits": [
|
||||
{
|
||||
"score": 12.34,
|
||||
"chunk_uid": "01KQHVREB6XPYF604RVZAP9NNY_1_39003",
|
||||
"archive_uid": "01KQHVREB6XPYF604RVZAP9NNY",
|
||||
"chunk_index": 1,
|
||||
"page_start": 1,
|
||||
"page_end": 1,
|
||||
"title": "NSD 76 Disposition of NSC Policy Documents",
|
||||
"source": "archive://nsc/nsd-76",
|
||||
"author": "Brent Scowcroft",
|
||||
"year": 1992,
|
||||
"series": null,
|
||||
"tags": ["NSD 76", "政策文件"],
|
||||
"text": "chunk text...",
|
||||
"embedding_model": "embedding-3",
|
||||
"embedding_dimensions": 2048
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 错误响应
|
||||
|
||||
#### JSON 格式错误
|
||||
|
||||
状态码:
|
||||
|
||||
```http
|
||||
400 Bad Request
|
||||
```
|
||||
|
||||
```json
|
||||
{
|
||||
"code": 400,
|
||||
"message": "Invalid JSON body.",
|
||||
"errors": {
|
||||
"body": "Syntax error"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### 缺少 query
|
||||
|
||||
状态码:
|
||||
|
||||
```http
|
||||
422 Unprocessable Entity
|
||||
```
|
||||
|
||||
```json
|
||||
{
|
||||
"code": 422,
|
||||
"message": "Search request validation failed.",
|
||||
"errors": {
|
||||
"query": "query is required."
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### 搜索失败
|
||||
|
||||
状态码:
|
||||
|
||||
```http
|
||||
500 Internal Server Error
|
||||
```
|
||||
|
||||
```json
|
||||
{
|
||||
"code": 500,
|
||||
"message": "Full-text search failed.",
|
||||
"errors": {
|
||||
"search": "error message"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## 向量搜索
|
||||
|
||||
```http
|
||||
POST /api/search/vector
|
||||
```
|
||||
|
||||
### 请求格式
|
||||
|
||||
`Content-Type: application/json`
|
||||
|
||||
| 字段 | 类型 | 必填 | 说明 |
|
||||
| --- | --- | --- | --- |
|
||||
| `query` | string | 是 | 搜索语句。系统会先调用智谱 `embedding-3` 转成 2048 维向量 |
|
||||
| `limit` | int | 否 | 返回条数,默认 `10`,最大 `50` |
|
||||
| `k` | int | 否 | OpenSearch kNN 候选数,默认等于 `limit`,最大 `50` |
|
||||
| `filters` | object | 否 | 过滤条件,同全文搜索 |
|
||||
|
||||
### 请求示例
|
||||
|
||||
```bash
|
||||
curl -X POST http://127.0.0.1:8787/api/search/vector \
|
||||
-H 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"query": "Iraq invasion and Desert Storm",
|
||||
"limit": 5,
|
||||
"k": 10
|
||||
}'
|
||||
```
|
||||
|
||||
中文 query 也可以提交给向量搜索:
|
||||
|
||||
```bash
|
||||
curl -X POST http://127.0.0.1:8787/api/search/vector \
|
||||
-H 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"query": "伊拉克入侵科威特与沙漠风暴",
|
||||
"limit": 5
|
||||
}'
|
||||
```
|
||||
|
||||
### 成功响应
|
||||
|
||||
状态码:
|
||||
|
||||
```http
|
||||
200 OK
|
||||
```
|
||||
|
||||
响应示例:
|
||||
|
||||
```json
|
||||
{
|
||||
"code": 0,
|
||||
"message": "Vector search completed.",
|
||||
"data": {
|
||||
"mode": "vector",
|
||||
"query": "Iraq invasion and Desert Storm",
|
||||
"limit": 5,
|
||||
"k": 10,
|
||||
"filters": [],
|
||||
"embedding_model": "embedding-3",
|
||||
"embedding_dimensions": 2048,
|
||||
"total": 5,
|
||||
"hits": [
|
||||
{
|
||||
"score": 0.91,
|
||||
"chunk_uid": "01KQHVREB6XPYF604RVZAP9NNY_14_97554",
|
||||
"archive_uid": "01KQHVREB6XPYF604RVZAP9NNY",
|
||||
"chunk_index": 14,
|
||||
"page_start": 8,
|
||||
"page_end": 8,
|
||||
"title": "NSD 76 Disposition of NSC Policy Documents",
|
||||
"source": "archive://nsc/nsd-76",
|
||||
"author": "Brent Scowcroft",
|
||||
"year": 1992,
|
||||
"series": null,
|
||||
"tags": ["NSD 76", "政策文件"],
|
||||
"text": "chunk text...",
|
||||
"embedding_model": "embedding-3",
|
||||
"embedding_dimensions": 2048
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
### 错误响应
|
||||
|
||||
错误响应格式与全文搜索一致。常见错误包括:
|
||||
|
||||
- JSON 格式错误:`400 Bad Request`
|
||||
- 缺少 `query`:`422 Unprocessable Entity`
|
||||
- embedding API 或 OpenSearch 查询失败:`500 Internal Server Error`
|
||||
|
||||
## 混合搜索
|
||||
|
||||
```http
|
||||
POST /api/search/hybrid
|
||||
```
|
||||
|
||||
### 接口说明
|
||||
|
||||
混合搜索会对同一个 `query` 同时执行:
|
||||
|
||||
1. BM25 全文搜索。
|
||||
2. 2048 维向量 kNN 搜索。
|
||||
3. 使用 Reciprocal Rank Fusion(RRF)合并排序。
|
||||
|
||||
第一版不做额外 reranker。RRF 不直接比较 BM25 分数和向量分数,而是根据两路结果中的排名融合,适合作为稳定的 hybrid baseline。
|
||||
|
||||
### 请求格式
|
||||
|
||||
`Content-Type: application/json`
|
||||
|
||||
| 字段 | 类型 | 必填 | 说明 |
|
||||
| --- | --- | --- | --- |
|
||||
| `query` | string | 是 | 搜索语句 |
|
||||
| `limit` | int | 否 | 最终返回条数,默认 `10`,最大 `50` |
|
||||
| `candidate_limit` | int | 否 | 每一路召回候选数,默认 `max(limit * 3, 20)`,最大 `50` |
|
||||
| `rrf_k` | int | 否 | RRF 平滑参数,默认 `60` |
|
||||
| `filters` | object | 否 | 过滤条件,同全文搜索 |
|
||||
| `ai` | bool | 否 | 默认 `false`。传 `true` 时,系统先调用现有 LLM chat 通道把原始 query 改写为 BM25 关键词;全文搜索使用 AI 关键词,向量搜索仍使用原始 query |
|
||||
|
||||
如果 AI 关键词生成失败或超时,系统会回退为使用原始 `query` 做全文搜索,并在响应的 `keywords.error` 中返回错误信息;向量搜索不受影响。
|
||||
|
||||
### 请求示例
|
||||
|
||||
```bash
|
||||
curl -X POST http://127.0.0.1:8787/api/search/hybrid \
|
||||
-H 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"query": "Iraq invasion and Desert Storm",
|
||||
"limit": 5,
|
||||
"candidate_limit": 20
|
||||
}'
|
||||
```
|
||||
|
||||
中文 query:
|
||||
|
||||
```bash
|
||||
curl -X POST http://127.0.0.1:8787/api/search/hybrid \
|
||||
-H 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"query": "伊拉克入侵科威特与沙漠风暴",
|
||||
"limit": 5,
|
||||
"ai": true
|
||||
}'
|
||||
```
|
||||
|
||||
### 成功响应
|
||||
|
||||
状态码:
|
||||
|
||||
```http
|
||||
200 OK
|
||||
```
|
||||
|
||||
响应示例(请求中传入 `"ai": true` 时):
|
||||
|
||||
```json
|
||||
{
|
||||
"code": 0,
|
||||
"message": "Hybrid search completed.",
|
||||
"data": {
|
||||
"mode": "hybrid",
|
||||
"query": "Iraq invasion and Desert Storm",
|
||||
"limit": 5,
|
||||
"candidate_limit": 20,
|
||||
"rrf_k": 60,
|
||||
"filters": [],
|
||||
"ai": true,
|
||||
"fulltext_query": "Iraq Kuwait invasion Desert Storm",
|
||||
"vector_query": "Iraq invasion and Desert Storm",
|
||||
"keywords": {
|
||||
"enabled": true,
|
||||
"attempted": true,
|
||||
"error": null,
|
||||
"keywords": ["Iraq", "Kuwait", "invasion", "Desert Storm"],
|
||||
"query": "Iraq Kuwait invasion Desert Storm",
|
||||
"model": "glm-4.7-flash"
|
||||
},
|
||||
"total": 10,
|
||||
"sources": {
|
||||
"fulltext_total": 1,
|
||||
"vector_total": 20,
|
||||
"fulltext_hits": 1,
|
||||
"vector_hits": 20
|
||||
},
|
||||
"hits": [
|
||||
{
|
||||
"score": 4.13,
|
||||
"hybrid_score": 0.0325,
|
||||
"rank_sources": {
|
||||
"fulltext": {
|
||||
"rank": 1,
|
||||
"score": 4.13,
|
||||
"rrf": 0.0163934426
|
||||
},
|
||||
"vector": {
|
||||
"rank": 1,
|
||||
"score": 0.79,
|
||||
"rrf": 0.0163934426
|
||||
}
|
||||
},
|
||||
"chunk_uid": "01KQHVREB6XPYF604RVZAP9NNY_14_97554",
|
||||
"archive_uid": "01KQHVREB6XPYF604RVZAP9NNY",
|
||||
"page_start": 8,
|
||||
"page_end": 8,
|
||||
"text": "chunk text..."
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 错误响应
|
||||
|
||||
错误响应格式与全文搜索一致。常见错误包括:
|
||||
|
||||
- JSON 格式错误:`400 Bad Request`
|
||||
- 缺少 `query`:`422 Unprocessable Entity`
|
||||
- embedding API、全文搜索或向量搜索失败:`500 Internal Server Error`
|
||||
|
||||
## 后续接口
|
||||
|
||||
以下能力尚未实现:
|
||||
|
||||
```http
|
||||
GET /api/chunks/{chunk_uid}
|
||||
GET /api/evidence/{chunk_uid}
|
||||
```
|
||||
132
app/controller/Api/SearchController.php
Normal file
132
app/controller/Api/SearchController.php
Normal file
@ -0,0 +1,132 @@
|
||||
<?php
|
||||
|
||||
namespace app\controller\Api;
|
||||
|
||||
use app\service\Search\OpenSearchSearchService;
|
||||
use InvalidArgumentException;
|
||||
use JsonException;
|
||||
use support\Request;
|
||||
use support\Response;
|
||||
use Throwable;
|
||||
|
||||
/**
 * HTTP endpoints for chunk search.
 *
 * Each public action decodes the JSON request body, delegates to the matching
 * method of OpenSearchSearchService and wraps the outcome in the
 * `{code, message, data|errors}` JSON envelope.
 */
class SearchController
{
    /**
     * Runs OpenSearchSearchService::fulltext() on the request payload.
     */
    public function fulltext(Request $request): Response
    {
        return $this->respond(
            $request,
            'Full-text search',
            static fn (array $payload) => (new OpenSearchSearchService())->fulltext($payload)
        );
    }

    /**
     * Runs OpenSearchSearchService::vector() on the request payload.
     */
    public function vector(Request $request): Response
    {
        return $this->respond(
            $request,
            'Vector search',
            static fn (array $payload) => (new OpenSearchSearchService())->vector($payload)
        );
    }

    /**
     * Runs OpenSearchSearchService::hybrid() on the request payload.
     */
    public function hybrid(Request $request): Response
    {
        return $this->respond(
            $request,
            'Hybrid search',
            static fn (array $payload) => (new OpenSearchSearchService())->hybrid($payload)
        );
    }

    /**
     * Shared request/response pipeline for all search actions.
     *
     * Error mapping:
     *  - JsonException            -> 400 "Invalid JSON body."
     *  - InvalidArgumentException -> 422 "Search request validation failed."
     *  - any other Throwable      -> 500 "<label> failed."
     *
     * @param Request  $request Incoming HTTP request.
     * @param string   $label   Human-readable search name used in the messages,
     *                          e.g. "Full-text search".
     * @param callable $search  Receives the decoded payload array and returns
     *                          the value placed under `data`.
     */
    private function respond(Request $request, string $label, callable $search): Response
    {
        try {
            // The payload is decoded before the search callable runs, so a
            // malformed body is reported without touching the search service.
            $payload = $this->jsonPayload($request);
            $data = $search($payload);
        } catch (JsonException $exception) {
            return $this->jsonResponse([
                'code' => 400,
                'message' => 'Invalid JSON body.',
                'errors' => ['body' => $exception->getMessage()],
            ], 400);
        } catch (InvalidArgumentException $exception) {
            return $this->jsonResponse([
                'code' => 422,
                'message' => 'Search request validation failed.',
                'errors' => ['query' => $exception->getMessage()],
            ], 422);
        } catch (Throwable $exception) {
            return $this->jsonResponse([
                'code' => 500,
                'message' => $label . ' failed.',
                'errors' => ['search' => $exception->getMessage()],
            ], 500);
        }

        return $this->jsonResponse([
            'code' => 0,
            'message' => $label . ' completed.',
            'data' => $data,
        ], 200);
    }

    /**
     * Decodes the request body as JSON.
     *
     * An empty (whitespace-only) body falls back to the parsed POST fields.
     * A body that is valid JSON but not an array/object yields an empty array.
     *
     * @throws JsonException When the body is not valid JSON.
     */
    private function jsonPayload(Request $request): array
    {
        $rawBody = trim($request->rawBody());
        if ($rawBody === '') {
            return $request->post();
        }

        $payload = json_decode($rawBody, true, 512, JSON_THROW_ON_ERROR);

        return is_array($payload) ? $payload : [];
    }

    /**
     * Builds a JSON response with the given HTTP status.
     *
     * JSON_INVALID_UTF8_SUBSTITUTE keeps encoding from throwing when the data
     * (for example chunk text or an exception message) contains malformed
     * UTF-8; this method runs outside the try/catch in respond(), so an
     * encoding exception here would otherwise escape the controller.
     */
    private function jsonResponse(array $data, int $status): Response
    {
        $flags = JSON_UNESCAPED_UNICODE
            | JSON_UNESCAPED_SLASHES
            | JSON_INVALID_UTF8_SUBSTITUTE
            | JSON_THROW_ON_ERROR;

        return response(
            json_encode($data, $flags),
            $status,
            ['Content-Type' => 'application/json']
        );
    }
}
|
||||
57
app/process/ProofDbTaskDispatcher.php
Normal file
57
app/process/ProofDbTaskDispatcher.php
Normal file
@ -0,0 +1,57 @@
|
||||
<?php
|
||||
|
||||
namespace app\process;
|
||||
|
||||
use app\service\Embedding\ChunkEmbeddingRepository;
|
||||
use app\service\Search\ChunkSearchIndexRepository;
|
||||
use app\service\Task\ProofDbTaskQueue;
|
||||
use Throwable;
|
||||
|
||||
/**
 * Background process that periodically looks for archives with pending
 * embedding or search-index work and pushes one task per archive onto the
 * ProofDbTaskQueue.
 */
class ProofDbTaskDispatcher
{
    private ProofDbTaskQueue $queue;
    private ChunkEmbeddingRepository $embeddings;
    private ChunkSearchIndexRepository $searchIndex;

    public function __construct()
    {
        $this->queue = new ProofDbTaskQueue();
        $this->embeddings = new ChunkEmbeddingRepository();
        $this->searchIndex = new ChunkSearchIndexRepository();
    }

    /**
     * Process entry point: dispatches forever, sleeping between rounds.
     *
     * NOTE(review): this loop never returns, so anything that relies on the
     * worker's event loop running in this process will presumably not be
     * serviced — confirm this is intended.
     */
    public function onWorkerStart(): void
    {
        while (true) {
            $this->dispatchOnce();
            sleep($this->queue->dispatcherIntervalSeconds());
        }
    }

    /**
     * Runs one dispatch round: embedding tasks first, then search-index tasks.
     *
     * Any failure aborts the rest of the round, is written to the error log
     * (previously it was swallowed silently) and is followed by an idle sleep
     * so a broken dependency is not hammered.
     */
    private function dispatchOnce(): void
    {
        try {
            $this->pushArchiveTasks(
                'embedding',
                $this->embeddings->queuePendingArchiveTasks($this->queue->dispatcherBatchSize())
            );
            $this->pushArchiveTasks(
                'search_index',
                $this->searchIndex->queuePendingArchiveTasks($this->queue->dispatcherBatchSize())
            );
        } catch (Throwable $exception) {
            error_log(sprintf(
                '[ProofDbTaskDispatcher] dispatch round failed: %s: %s',
                get_class($exception),
                $exception->getMessage()
            ));
            sleep($this->queue->idleSleepSeconds());
        }
    }

    /**
     * Pushes one first-attempt task per archive UID onto the queue.
     *
     * @param string   $taskType    Value stored under `task_type`.
     * @param iterable $archiveUids Archive UIDs to enqueue.
     */
    private function pushArchiveTasks(string $taskType, iterable $archiveUids): void
    {
        foreach ($archiveUids as $archiveUid) {
            $this->queue->push([
                'task_type' => $taskType,
                'target_type' => 'archive',
                'target_uid' => $archiveUid,
                'attempt' => 1,
                'queued_at' => date(DATE_ATOM),
            ]);
        }
    }
}
|
||||
56
app/process/ProofDbTaskWorker.php
Normal file
56
app/process/ProofDbTaskWorker.php
Normal file
@ -0,0 +1,56 @@
|
||||
<?php
|
||||
|
||||
namespace app\process;
|
||||
|
||||
use app\service\Embedding\ChunkEmbeddingHandler;
|
||||
use app\service\Search\ChunkSearchIndexHandler;
|
||||
use app\service\Task\ProofDbTaskQueue;
|
||||
use Throwable;
|
||||
use Workerman\Timer;
|
||||
|
||||
/**
 * Background process that pops tasks from the ProofDbTaskQueue and routes them
 * to the embedding or search-index handler.
 */
class ProofDbTaskWorker
{
    private ProofDbTaskQueue $queue;
    private ChunkEmbeddingHandler $embeddings;
    private ChunkSearchIndexHandler $searchIndex;

    public function __construct()
    {
        $this->queue = new ProofDbTaskQueue();
        $this->embeddings = new ChunkEmbeddingHandler();
        $this->searchIndex = new ChunkSearchIndexHandler();
    }

    /**
     * Process entry point: consumes tasks forever.
     *
     * Each iteration first releases delayed tasks that are due, then waits for
     * a task; when none arrives it sleeps for the idle interval.
     */
    public function onWorkerStart(): void
    {
        // NOTE(review): the loop below never returns control to the event
        // loop, so this timer presumably never fires; due delayed tasks are
        // released by the explicit call at the top of each iteration instead.
        // Confirm, then drop either the timer or the blocking loop.
        Timer::add(10, fn (): int => $this->queue->releaseDueDelayed());

        while (true) {
            $this->queue->releaseDueDelayed();

            $task = $this->queue->pop($this->queue->blockTimeout());
            if ($task === null) {
                sleep($this->queue->idleSleepSeconds());
                continue;
            }

            $this->handle($task);
        }
    }

    /**
     * Routes one task by its `task_type` and settles its retry state.
     *
     * On success (or for an unrecognised type) the retry marker is cleared;
     * any exception from a handler schedules the task for a later retry.
     * Unrecognised task types are written to the error log so they do not
     * disappear silently.
     *
     * @param array $task Task payload as pushed onto the queue.
     */
    private function handle(array $task): void
    {
        try {
            $taskType = $task['task_type'] ?? null;

            match ($taskType) {
                'embedding' => $this->embeddings->handle($task),
                'search_index' => $this->searchIndex->handle($task),
                default => error_log(sprintf(
                    '[ProofDbTaskWorker] ignoring task with unknown task_type %s',
                    json_encode($taskType)
                )),
            };

            $this->queue->clearRetry($task);
        } catch (Throwable $exception) {
            $this->queue->retryLater($task, $exception->getMessage());
        }
    }
}
|
||||
@ -43,6 +43,11 @@ class ArchiveRepository
|
||||
'embedding_status' => 0,
|
||||
'embedding_ref' => null,
|
||||
'embedding_model' => null,
|
||||
'embedding_error' => null,
|
||||
'embedding_updated_at' => null,
|
||||
'search_index_status' => 0,
|
||||
'search_index_error' => null,
|
||||
'search_index_updated_at' => null,
|
||||
]);
|
||||
}
|
||||
});
|
||||
|
||||
125
app/service/Embedding/BigModelEmbeddingClient.php
Normal file
125
app/service/Embedding/BigModelEmbeddingClient.php
Normal file
@ -0,0 +1,125 @@
|
||||
<?php
|
||||
|
||||
namespace app\service\Embedding;
|
||||
|
||||
use app\service\LLM\LLMRequestException;
|
||||
use GuzzleHttp\Client;
|
||||
use GuzzleHttp\Exception\RequestException;
|
||||
use RuntimeException;
|
||||
use Throwable;
|
||||
|
||||
/**
 * Thin HTTP client for the BigModel `embeddings` endpoint.
 *
 * Configuration keys read from the config array: `base_url`, `api_key`,
 * `model`, `dimensions`, `batch_size`, `timeout`, `connect_timeout`.
 */
class BigModelEmbeddingClient
{
    private Client $client;
    private array $config;

    /**
     * @param array|null $config Explicit configuration; defaults to
     *                           config('LLMapi.embedding').
     */
    public function __construct(?array $config = null)
    {
        $this->config = $config ?? config('LLMapi.embedding', []);
        $baseUrl = rtrim((string) ($this->config['base_url'] ?? ''), '/');

        $this->client = new Client([
            // Trailing slash so the relative path "embeddings" is appended to
            // the base path instead of replacing its last segment.
            'base_uri' => $baseUrl . '/',
            'timeout' => (int) ($this->config['timeout'] ?? 60),
            'connect_timeout' => (int) ($this->config['connect_timeout'] ?? 10),
        ]);
    }

    /**
     * True when both a non-blank API key and a non-blank base URL are set.
     */
    public function isConfigured(): bool
    {
        return $this->apiKey() !== ''
            && trim((string) ($this->config['base_url'] ?? '')) !== '';
    }

    /**
     * Requests embeddings for a batch of texts.
     *
     * @param array $texts   Texts to embed; keys are discarded.
     * @param array $options Optional overrides: `model`, `dimensions`.
     *
     * @return array Decoded provider response; always contains an array under
     *               `data`. For an empty input no request is made and
     *               `['model' => ..., 'data' => [], 'usage' => []]` is returned.
     *
     * @throws RuntimeException    When the client is not configured, the batch
     *                             is larger than `batch_size`, `dimensions` is
     *                             not an allowed value, the transport fails or
     *                             the response is not the expected shape.
     * @throws LLMRequestException When the request fails with a Guzzle
     *                             RequestException.
     */
    public function embed(array $texts, array $options = []): array
    {
        if (!$this->isConfigured()) {
            throw new RuntimeException('BigModel embedding API is not configured.');
        }

        $model = (string) ($options['model'] ?? $this->config['model'] ?? 'embedding-3');

        $texts = array_values($texts);
        if ($texts === []) {
            return [
                'model' => $model,
                'data' => [],
                'usage' => [],
            ];
        }

        $batchSize = (int) ($this->config['batch_size'] ?? 64);
        if (count($texts) > $batchSize) {
            throw new RuntimeException("Embedding batch exceeds configured batch size {$batchSize}.");
        }

        $body = [
            'model' => $model,
            'input' => $texts,
        ];

        $dimensions = $options['dimensions'] ?? $this->config['dimensions'] ?? null;
        if ($dimensions !== null) {
            $dimensions = (int) $dimensions;
            if (!in_array($dimensions, [256, 512, 1024, 2048], true)) {
                throw new RuntimeException('Embedding dimensions must be one of 256, 512, 1024, or 2048.');
            }
            $body['dimensions'] = $dimensions;
        }

        try {
            $response = $this->client->post('embeddings', [
                'headers' => [
                    // Use the trimmed key: isConfigured() validates the trimmed
                    // value, and stray whitespace or a trailing newline from an
                    // env file would otherwise end up in the header.
                    'Authorization' => 'Bearer ' . $this->apiKey(),
                    'Content-Type' => 'application/json',
                ],
                'json' => $body,
            ]);
        } catch (RequestException $exception) {
            throw $this->requestException($exception);
        } catch (Throwable $exception) {
            throw new RuntimeException('BigModel embedding request failed: ' . $exception->getMessage(), 0, $exception);
        }

        $payload = json_decode((string) $response->getBody(), true);
        if (!is_array($payload) || !isset($payload['data']) || !is_array($payload['data'])) {
            throw new RuntimeException('BigModel embedding response is invalid.');
        }

        return $payload;
    }

    /**
     * Returns the configured API key with surrounding whitespace removed.
     */
    private function apiKey(): string
    {
        return trim((string) ($this->config['api_key'] ?? ''));
    }

    /**
     * Converts a Guzzle RequestException into an LLMRequestException.
     *
     * The provider code and message are read from `error.code` /
     * `error.message` of the JSON error body, falling back to top-level
     * `code` / `message`. When no provider message is available the Guzzle
     * message is appended instead.
     */
    private function requestException(RequestException $exception): LLMRequestException
    {
        $response = $exception->getResponse();
        $statusCode = $response?->getStatusCode();
        $body = $response ? (string) $response->getBody() : '';
        $payload = json_decode($body, true);

        $providerCode = null;
        $providerMessage = null;

        if (is_array($payload)) {
            $providerCode = isset($payload['error']['code']) ? (string) $payload['error']['code'] : null;
            $providerMessage = isset($payload['error']['message']) ? (string) $payload['error']['message'] : null;

            if ($providerCode === null && isset($payload['code'])) {
                $providerCode = (string) $payload['code'];
            }
            if ($providerMessage === null && isset($payload['message'])) {
                $providerMessage = (string) $payload['message'];
            }
        }

        $message = 'BigModel embedding request failed';
        if ($statusCode !== null) {
            $message .= " with HTTP {$statusCode}";
        }
        if ($providerCode !== null) {
            $message .= " and provider code {$providerCode}";
        }
        $message .= ': ' . ($providerMessage ?? $exception->getMessage());

        return new LLMRequestException($message, $statusCode, $providerCode, is_array($payload) ? $payload : null);
    }
}
|
||||
105
app/service/Embedding/ChunkEmbeddingHandler.php
Normal file
105
app/service/Embedding/ChunkEmbeddingHandler.php
Normal file
@ -0,0 +1,105 @@
|
||||
<?php
|
||||
|
||||
namespace app\service\Embedding;
|
||||
|
||||
use app\service\LLM\LLMRequestException;
|
||||
use app\service\LLM\LLMRetryQueue;
|
||||
use Throwable;
|
||||
|
||||
/**
 * Handles an "embedding" task for one archive: loads a batch of queued chunks,
 * requests their embeddings and stores the result on each chunk.
 */
class ChunkEmbeddingHandler
{
    private BigModelEmbeddingClient $client;
    private ChunkEmbeddingRepository $chunks;
    private LLMRetryQueue $retryQueue;

    public function __construct(
        ?BigModelEmbeddingClient $client = null,
        ?ChunkEmbeddingRepository $chunks = null,
        ?LLMRetryQueue $retryQueue = null
    ) {
        $this->client = $client ?? new BigModelEmbeddingClient();
        $this->chunks = $chunks ?? new ChunkEmbeddingRepository();
        $this->retryQueue = $retryQueue ?? new LLMRetryQueue();
    }

    /**
     * Embeds one batch of queued chunks of the archive named by the task.
     *
     * Tasks whose `target_type` is not "archive", or whose `target_uid` is
     * blank, are ignored. At most `LLMapi.embedding.batch_size` chunks are
     * processed per call.
     *
     * On failure only chunks that have not yet received a final status in this
     * call are marked failed, so chunks already stored as embedded are not
     * overwritten; the exception is then rethrown to the caller.
     *
     * @param array $task Task payload (`target_type`, `target_uid`, ...).
     *
     * @throws Throwable Whatever the embedding request or persistence raised.
     */
    public function handle(array $task): void
    {
        if (($task['target_type'] ?? null) !== 'archive') {
            return;
        }

        $archiveUid = trim((string) ($task['target_uid'] ?? ''));
        if ($archiveUid === '') {
            return;
        }

        $batchSize = (int) config('LLMapi.embedding.batch_size', 32);
        $chunks = $this->chunks->findQueuedChunks($archiveUid, $batchSize);
        if ($chunks === []) {
            return;
        }

        $chunkUids = array_column($chunks, 'chunk_uid');
        $this->chunks->markProcessing($chunkUids);

        // UIDs that already got their final status (embedded, or failed for a
        // missing result) during persistence.
        $settledUids = [];

        try {
            $payload = $this->retryQueue->run(
                fn (): array => $this->client->embed(array_column($chunks, 'text'), [
                    'model' => config('LLMapi.embedding.model', 'embedding-3'),
                    'dimensions' => config('LLMapi.embedding.dimensions', 2048),
                ]),
                config('LLMapi.embedding.retry', [])
            );

            $this->persistEmbeddings($chunks, $payload, $settledUids);
        } catch (Throwable $exception) {
            $unsettledUids = array_values(array_diff($chunkUids, $settledUids));
            $this->chunks->markFailed($unsettledUids, $exception->getMessage(), $this->isRetryable($exception));
            throw $exception;
        }
    }

    /**
     * Stores the embeddings from a provider response on their chunks.
     *
     * Response items are matched to chunks by their `index` field; a chunk
     * with no usable item is marked as a retryable failure.
     *
     * @param array $chunks      Chunk rows in the order they were sent.
     * @param array $payload     Decoded provider response.
     * @param array $settledUids Receives the UID of every chunk that has been
     *                           given a final status by this method.
     */
    private function persistEmbeddings(array $chunks, array $payload, array &$settledUids = []): void
    {
        $model = (string) ($payload['model'] ?? config('LLMapi.embedding.model', 'embedding-3'));
        $usage = is_array($payload['usage'] ?? null) ? $payload['usage'] : [];
        $results = [];

        foreach ($payload['data'] ?? [] as $item) {
            if (!is_array($item) || !isset($item['index'], $item['embedding']) || !is_array($item['embedding'])) {
                continue;
            }
            $results[(int) $item['index']] = $item['embedding'];
        }

        foreach ($chunks as $index => $chunk) {
            if (!isset($results[$index])) {
                $this->chunks->markFailed([$chunk['chunk_uid']], 'Embedding response missing index ' . $index, true);
                $settledUids[] = $chunk['chunk_uid'];
                continue;
            }

            $embedding = $results[$index];
            $this->chunks->markEmbedded($chunk['chunk_uid'], [
                'provider' => 'bigmodel',
                'model' => $model,
                'dimensions' => count($embedding),
                'embedding' => $embedding,
                'usage' => $usage,
                'embedded_at' => date(DATE_ATOM),
            ], $model);
            $settledUids[] = $chunk['chunk_uid'];
        }
    }

    /**
     * Decides whether a failure should leave the chunks retryable.
     *
     * Everything is retryable except an LLMRequestException carrying an HTTP
     * status other than 429 or 5xx.
     */
    private function isRetryable(Throwable $exception): bool
    {
        if (!$exception instanceof LLMRequestException) {
            return true;
        }

        $statusCode = $exception->statusCode();
        if ($statusCode === null) {
            return true;
        }

        return $statusCode === 429 || $statusCode >= 500;
    }
}
|
||||
97
app/service/Embedding/ChunkEmbeddingRepository.php
Normal file
97
app/service/Embedding/ChunkEmbeddingRepository.php
Normal file
@ -0,0 +1,97 @@
|
||||
<?php
|
||||
|
||||
namespace app\service\Embedding;
|
||||
|
||||
use support\Db;
|
||||
|
||||
/**
 * Database access for the embedding columns of the `chunks` table.
 */
class ChunkEmbeddingRepository
{
    /**
     * Selects up to $limit archives that still have chunks awaiting embedding
     * (pending, queued or retryable-failed), moves those chunks to QUEUED and
     * returns the archive UIDs, ordered by each archive's smallest chunk id.
     *
     * NOTE(review): chunks that are already QUEUED are selected again, so the
     * same archive is returned on every call until it is processed — verify
     * the queue tolerates duplicate tasks.
     *
     * @return string[] Archive UIDs to enqueue.
     */
    public function queuePendingArchiveTasks(int $limit): array
    {
        $statuses = [EmbeddingStatus::PENDING, EmbeddingStatus::QUEUED, EmbeddingStatus::FAILED_RETRYABLE];

        $archiveUids = Db::table('chunks')
            ->whereIn('embedding_status', $statuses)
            ->select('archive_uid')
            ->groupBy('archive_uid')
            ->orderByRaw('MIN(id)')
            ->limit($limit)
            ->pluck('archive_uid')
            ->all();

        $archiveUids = array_values(array_filter(array_map('strval', $archiveUids)));
        if ($archiveUids === []) {
            return [];
        }

        // One statement for all selected archives instead of one per archive.
        Db::table('chunks')
            ->whereIn('archive_uid', $archiveUids)
            ->whereIn('embedding_status', $statuses)
            ->update([
                'embedding_status' => EmbeddingStatus::QUEUED,
                'embedding_error' => null,
                'embedding_updated_at' => Db::raw('CURRENT_TIMESTAMP'),
            ]);

        return $archiveUids;
    }

    /**
     * Returns up to $limit chunks of an archive that are QUEUED or PROCESSING,
     * ordered by chunk index.
     *
     * @return array<int, array{chunk_uid: string, archive_uid: string, chunk_index: int, text: string}>
     */
    public function findQueuedChunks(string $archiveUid, int $limit): array
    {
        $chunks = Db::table('chunks')
            ->where('archive_uid', $archiveUid)
            ->whereIn('embedding_status', [EmbeddingStatus::QUEUED, EmbeddingStatus::PROCESSING])
            ->orderBy('chunk_index')
            ->limit($limit)
            ->get(['chunk_uid', 'archive_uid', 'chunk_index', 'text'])
            ->all();

        return array_map(static fn (object $chunk): array => [
            'chunk_uid' => (string) $chunk->chunk_uid,
            'archive_uid' => (string) $chunk->archive_uid,
            'chunk_index' => (int) $chunk->chunk_index,
            'text' => (string) $chunk->text,
        ], $chunks);
    }

    /**
     * Marks the given chunks as PROCESSING.
     */
    public function markProcessing(array $chunkUids): void
    {
        $this->updateStatus($chunkUids, EmbeddingStatus::PROCESSING);
    }

    /**
     * Stores an embedding on a chunk, marks it EMBEDDED and resets its
     * search-index columns so the chunk counts as not yet indexed.
     *
     * @param array $embeddingRef Data serialised as JSON into `embedding_ref`.
     *
     * @throws \JsonException When $embeddingRef cannot be encoded; previously
     *                        the failed encoding result was stored and the
     *                        chunk was still marked EMBEDDED.
     */
    public function markEmbedded(string $chunkUid, array $embeddingRef, string $model): void
    {
        $encodedRef = json_encode(
            $embeddingRef,
            JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_THROW_ON_ERROR
        );

        Db::table('chunks')->where('chunk_uid', $chunkUid)->update([
            'embedding_status' => EmbeddingStatus::EMBEDDED,
            'embedding_ref' => $encodedRef,
            'embedding_model' => $model,
            'embedding_error' => null,
            'embedding_updated_at' => Db::raw('CURRENT_TIMESTAMP'),
            'search_index_status' => 0,
            'search_index_error' => null,
            'search_index_updated_at' => null,
        ]);
    }

    /**
     * Marks chunks as failed and records the error (truncated to 4000
     * characters).
     *
     * @param bool $retryable True for FAILED_RETRYABLE, false for
     *                        FAILED_TERMINAL.
     */
    public function markFailed(array $chunkUids, string $error, bool $retryable): void
    {
        if ($chunkUids === []) {
            return;
        }

        Db::table('chunks')->whereIn('chunk_uid', $chunkUids)->update([
            'embedding_status' => $retryable ? EmbeddingStatus::FAILED_RETRYABLE : EmbeddingStatus::FAILED_TERMINAL,
            'embedding_error' => mb_substr($error, 0, 4000),
            'embedding_updated_at' => Db::raw('CURRENT_TIMESTAMP'),
        ]);
    }

    /**
     * Sets the embedding status of the given chunks; no-op for an empty list.
     */
    private function updateStatus(array $chunkUids, int $status): void
    {
        if ($chunkUids === []) {
            return;
        }

        Db::table('chunks')->whereIn('chunk_uid', $chunkUids)->update([
            'embedding_status' => $status,
            'embedding_updated_at' => Db::raw('CURRENT_TIMESTAMP'),
        ]);
    }
}
|
||||
13
app/service/Embedding/EmbeddingStatus.php
Normal file
13
app/service/Embedding/EmbeddingStatus.php
Normal file
@ -0,0 +1,13 @@
|
||||
<?php
|
||||
|
||||
namespace app\service\Embedding;
|
||||
|
||||
/**
 * Integer values stored in the chunk `embedding_status` column.
 */
class EmbeddingStatus
{
    /** Not yet picked up for embedding. */
    public const PENDING = 0;

    /** Selected for embedding and waiting for a handler. */
    public const QUEUED = 1;

    /** An embedding request for the chunk is in progress. */
    public const PROCESSING = 2;

    /** Embedding stored successfully. */
    public const EMBEDDED = 3;

    /** Embedding failed; the chunk may be queued again. */
    public const FAILED_RETRYABLE = 4;

    /** Embedding failed and is not queued again. */
    public const FAILED_TERMINAL = 5;
}
|
||||
99
app/service/Search/ChunkSearchIndexHandler.php
Normal file
99
app/service/Search/ChunkSearchIndexHandler.php
Normal file
@ -0,0 +1,99 @@
|
||||
<?php
|
||||
|
||||
namespace app\service\Search;
|
||||
|
||||
use Throwable;
|
||||
|
||||
class ChunkSearchIndexHandler
|
||||
{
|
||||
private ChunkSearchIndexRepository $chunks;
|
||||
private OpenSearchChunkIndex $index;
|
||||
|
||||
/**
 * Both collaborators are optional; a default instance is created for any
 * argument that is omitted or null.
 */
public function __construct(
    ?ChunkSearchIndexRepository $chunks = null,
    ?OpenSearchChunkIndex $index = null
) {
    if ($chunks === null) {
        $chunks = new ChunkSearchIndexRepository();
    }
    if ($index === null) {
        $index = new OpenSearchChunkIndex();
    }

    $this->chunks = $chunks;
    $this->index = $index;
}
|
||||
|
||||
public function handle(array $task): void
|
||||
{
|
||||
if (($task['target_type'] ?? null) !== 'archive') {
|
||||
return;
|
||||
}
|
||||
|
||||
$archiveUid = trim((string) ($task['target_uid'] ?? ''));
|
||||
if ($archiveUid === '') {
|
||||
return;
|
||||
}
|
||||
|
||||
$documents = $this->chunks->findQueuedDocuments($archiveUid, (int) config('opensearch.bulk.chunk_size', 500));
|
||||
if ($documents === []) {
|
||||
return;
|
||||
}
|
||||
|
||||
$chunkUids = array_column($documents, 'chunk_uid');
|
||||
$this->chunks->markIndexing($chunkUids);
|
||||
|
||||
try {
|
||||
$documents = $this->validatedDocuments($documents);
|
||||
if ($documents === []) {
|
||||
return;
|
||||
}
|
||||
|
||||
$chunkUids = array_column($documents, 'chunk_uid');
|
||||
$this->index->ensureExists();
|
||||
$response = $this->index->bulkIndex($documents);
|
||||
$failedChunkUids = $this->failedChunkUids($response);
|
||||
|
||||
if ($failedChunkUids !== []) {
|
||||
$this->chunks->markFailed($failedChunkUids, 'OpenSearch bulk index returned item errors.', true);
|
||||
}
|
||||
|
||||
$indexedChunkUids = array_values(array_diff($chunkUids, $failedChunkUids));
|
||||
$this->chunks->markIndexed($indexedChunkUids);
|
||||
} catch (Throwable $exception) {
|
||||
$this->chunks->markFailed($chunkUids, $exception->getMessage(), true);
|
||||
throw $exception;
|
||||
}
|
||||
}
|
||||
|
||||
private function validatedDocuments(array $documents): array
|
||||
{
|
||||
$dimensions = (int) config('opensearch.vector.dimensions', 2048);
|
||||
$valid = [];
|
||||
foreach ($documents as $document) {
|
||||
if (($document['embedding_dimensions'] ?? 0) !== $dimensions) {
|
||||
$this->chunks->markFailed([$document['chunk_uid']], sprintf(
|
||||
'Chunk %s embedding dimension mismatch: expected %d, got %d.',
|
||||
$document['chunk_uid'] ?? '',
|
||||
$dimensions,
|
||||
$document['embedding_dimensions'] ?? 0
|
||||
), false);
|
||||
continue;
|
||||
}
|
||||
|
||||
$valid[] = $document;
|
||||
}
|
||||
|
||||
return $valid;
|
||||
}
|
||||
|
||||
private function failedChunkUids(array $response): array
|
||||
{
|
||||
if (($response['errors'] ?? false) !== true) {
|
||||
return [];
|
||||
}
|
||||
|
||||
$failed = [];
|
||||
foreach ($response['items'] ?? [] as $item) {
|
||||
$result = $item['index'] ?? [];
|
||||
if (isset($result['error'])) {
|
||||
$failed[] = (string) ($result['_id'] ?? '');
|
||||
}
|
||||
}
|
||||
|
||||
return array_values(array_filter($failed));
|
||||
}
|
||||
}
|
||||
157
app/service/Search/ChunkSearchIndexRepository.php
Normal file
157
app/service/Search/ChunkSearchIndexRepository.php
Normal file
@ -0,0 +1,157 @@
|
||||
<?php
|
||||
|
||||
namespace app\service\Search;
|
||||
|
||||
use app\service\Embedding\EmbeddingStatus;
|
||||
use support\Db;
|
||||
|
||||
/**
 * Database access for search indexing: selects chunks that are ready to be
 * indexed, turns rows into index documents and tracks `search_index_status`.
 */
class ChunkSearchIndexRepository
{
    /**
     * Pick archives that still have embedded chunks waiting for indexing and
     * mark those chunks as queued.
     *
     * Chunks count as waiting when their search-index status is PENDING,
     * QUEUED, INDEXING or FAILED_RETRYABLE. Archives are ordered by their
     * smallest chunk id.
     *
     * @param int $limit Maximum number of archives to return.
     *
     * @return string[] Archive UIDs whose chunks were (re)queued.
     */
    public function queuePendingArchiveTasks(int $limit): array
    {
        $statuses = [
            SearchIndexStatus::PENDING,
            SearchIndexStatus::QUEUED,
            SearchIndexStatus::INDEXING,
            SearchIndexStatus::FAILED_RETRYABLE,
        ];

        $archiveUids = Db::table('chunks')
            ->where('embedding_status', EmbeddingStatus::EMBEDDED)
            ->whereIn('search_index_status', $statuses)
            ->select('archive_uid')
            ->groupBy('archive_uid')
            ->orderByRaw('MIN(id)')
            ->limit($limit)
            ->pluck('archive_uid')
            ->all();

        $archiveUids = array_values(array_filter(array_map('strval', $archiveUids)));
        if ($archiveUids === []) {
            return [];
        }

        // One statement for all selected archives instead of one per archive.
        Db::table('chunks')
            ->whereIn('archive_uid', $archiveUids)
            ->where('embedding_status', EmbeddingStatus::EMBEDDED)
            ->whereIn('search_index_status', $statuses)
            ->update([
                'search_index_status' => SearchIndexStatus::QUEUED,
                'search_index_error' => null,
                'search_index_updated_at' => Db::raw('CURRENT_TIMESTAMP'),
            ]);

        return $archiveUids;
    }

    /**
     * Load index documents for an archive's embedded chunks that are QUEUED
     * or INDEXING, ordered by chunk index and joined with archive metadata.
     *
     * @param string $archiveUid Archive to load chunks for.
     * @param int    $limit      Maximum number of chunks to return.
     *
     * @return array[] One document array per chunk.
     */
    public function findQueuedDocuments(string $archiveUid, int $limit): array
    {
        $rows = Db::table('chunks')
            ->join('archives', 'chunks.archive_uid', '=', 'archives.archive_uid')
            ->where('chunks.archive_uid', $archiveUid)
            ->where('chunks.embedding_status', EmbeddingStatus::EMBEDDED)
            ->whereIn('chunks.search_index_status', [SearchIndexStatus::QUEUED, SearchIndexStatus::INDEXING])
            ->orderBy('chunks.chunk_index')
            ->limit($limit)
            ->get([
                'chunks.chunk_uid',
                'chunks.archive_uid',
                'chunks.chunk_index',
                'chunks.page_start',
                'chunks.page_end',
                'chunks.text',
                'chunks.embedding_ref',
                'chunks.embedding_model',
                'chunks.created_time',
                'chunks.updated_time',
                'archives.title',
                'archives.source',
                'archives.author',
                'archives.year',
                'archives.series',
                'archives.tags',
            ])
            ->all();

        return array_map(fn (object $row): array => $this->documentFromRow($row), $rows);
    }

    /**
     * Mark chunks as INDEXING and clear any stored error.
     */
    public function markIndexing(array $chunkUids): void
    {
        $this->updateStatus($chunkUids, SearchIndexStatus::INDEXING, null);
    }

    /**
     * Mark chunks as INDEXED and clear any stored error.
     */
    public function markIndexed(array $chunkUids): void
    {
        $this->updateStatus($chunkUids, SearchIndexStatus::INDEXED, null);
    }

    /**
     * Mark chunks as failed and store the error text.
     *
     * @param array  $chunkUids Values matched against `chunks.chunk_uid`.
     * @param string $error     Failure description (truncated to 4000 characters).
     * @param bool   $retryable True stores FAILED_RETRYABLE, false FAILED_TERMINAL.
     */
    public function markFailed(array $chunkUids, string $error, bool $retryable): void
    {
        $this->updateStatus(
            $chunkUids,
            $retryable ? SearchIndexStatus::FAILED_RETRYABLE : SearchIndexStatus::FAILED_TERMINAL,
            $error
        );
    }

    /**
     * Convert a joined chunk/archive row into an index document.
     *
     * The vector is read from the `embedding` key of the JSON stored in
     * `embedding_ref`; `embedding_dimensions` is its element count.
     */
    private function documentFromRow(object $row): array
    {
        $embeddingRef = $this->decodeJson($row->embedding_ref ?? null, []);
        $embedding = is_array($embeddingRef['embedding'] ?? null) ? $embeddingRef['embedding'] : [];

        return [
            'chunk_uid' => (string) $row->chunk_uid,
            'archive_uid' => (string) $row->archive_uid,
            'chunk_index' => (int) $row->chunk_index,
            'page_start' => $row->page_start === null ? null : (int) $row->page_start,
            'page_end' => $row->page_end === null ? null : (int) $row->page_end,
            'title' => $row->title,
            'source' => $row->source,
            'author' => $row->author,
            'year' => $row->year === null ? null : (int) $row->year,
            'series' => $row->series,
            'tags' => $this->decodeJson($row->tags ?? null, []),
            'text' => (string) $row->text,
            'embedding' => array_map('floatval', $embedding),
            'embedding_model' => (string) $row->embedding_model,
            'embedding_dimensions' => count($embedding),
            'created_time' => $this->dateString($row->created_time ?? null),
            'updated_time' => $this->dateString($row->updated_time ?? null),
        ];
    }

    /**
     * Write a search-index status (and optional error) to the given chunks.
     * An empty UID list is a no-op.
     */
    private function updateStatus(array $chunkUids, int $status, ?string $error): void
    {
        if ($chunkUids === []) {
            return;
        }

        Db::table('chunks')->whereIn('chunk_uid', $chunkUids)->update([
            'search_index_status' => $status,
            'search_index_error' => $error === null ? null : mb_substr($error, 0, 4000),
            'search_index_updated_at' => Db::raw('CURRENT_TIMESTAMP'),
        ]);
    }

    /**
     * Decode a JSON column value into an array.
     *
     * Arrays pass through unchanged; blank, non-string or undecodable input
     * (and JSON that does not decode to an array) yields `$fallback`.
     */
    private function decodeJson(mixed $value, array $fallback): array
    {
        if (is_array($value)) {
            return $value;
        }

        if (!is_string($value) || trim($value) === '') {
            return $fallback;
        }

        $decoded = json_decode($value, true);

        return is_array($decoded) ? $decoded : $fallback;
    }

    /**
     * Format a timestamp value as an ATOM (ISO 8601) string.
     *
     * Null stays null. A value `strtotime()` cannot parse falls back to the
     * current time.
     */
    private function dateString(mixed $value): ?string
    {
        if ($value === null) {
            return null;
        }

        return date(DATE_ATOM, strtotime((string) $value) ?: time());
    }
}
|
||||
113
app/service/Search/OpenSearchChunkIndex.php
Normal file
113
app/service/Search/OpenSearchChunkIndex.php
Normal file
@ -0,0 +1,113 @@
|
||||
<?php
|
||||
|
||||
namespace app\service\Search;
|
||||
|
||||
use OpenSearch\Client;
|
||||
|
||||
/**
 * Gateway to the OpenSearch index that stores chunk documents: creates the
 * index with its mapping and bulk-indexes documents into it.
 */
class OpenSearchChunkIndex
{
    /** Client actually in use; built on first need when none was injected. */
    private ?Client $resolvedClient = null;

    /**
     * @param Client|null $client Optional pre-built client; when null one is
     *                            created through OpenSearchClientFactory.
     */
    public function __construct(private readonly ?Client $client = null)
    {
    }

    /**
     * Create the chunk index with the mapping from {@see mapping()} unless it
     * already exists.
     *
     * @throws \Throwable When creation fails and the index still does not exist.
     */
    public function ensureExists(): void
    {
        $client = $this->client();
        $index = $this->indexName();

        if ($client->indices()->exists(['index' => $index])) {
            return;
        }

        try {
            $client->indices()->create([
                'index' => $index,
                'body' => $this->mapping(),
            ]);
        } catch (\Throwable $exception) {
            // Another worker may have created the index between the check
            // and the create call; only fail when it is really missing.
            if (!$client->indices()->exists(['index' => $index])) {
                throw $exception;
            }
        }
    }

    /**
     * Send the documents in one bulk request, using each document's
     * `chunk_uid` as the document id.
     *
     * @param array[] $documents Documents to index; each must contain `chunk_uid`.
     *
     * @return array The bulk response; for empty input `['items' => [], 'errors' => false]`
     *               is returned without contacting OpenSearch.
     */
    public function bulkIndex(array $documents): array
    {
        if ($documents === []) {
            return ['items' => [], 'errors' => false];
        }

        $index = $this->indexName();
        $body = [];
        foreach ($documents as $document) {
            // Bulk body alternates an action line and the document itself.
            $body[] = [
                'index' => [
                    '_index' => $index,
                    '_id' => $document['chunk_uid'],
                ],
            ];
            $body[] = $document;
        }

        return $this->client()->bulk([
            'refresh' => config('opensearch.bulk.refresh', 'false'),
            'body' => $body,
        ]);
    }

    /**
     * Index settings and field mappings used when the index is created.
     *
     * The `embedding` field is a `knn_vector` whose dimension, space type and
     * engine come from the `opensearch.vector.*` configuration.
     *
     * @return array Body for the create-index request.
     */
    public function mapping(): array
    {
        return [
            'settings' => [
                'index' => [
                    'knn' => true,
                ],
            ],
            'mappings' => [
                'properties' => [
                    'chunk_uid' => ['type' => 'keyword'],
                    'archive_uid' => ['type' => 'keyword'],
                    'chunk_index' => ['type' => 'integer'],
                    'page_start' => ['type' => 'integer'],
                    'page_end' => ['type' => 'integer'],
                    'title' => $this->textWithKeyword(),
                    'source' => $this->textWithKeyword(),
                    'author' => $this->textWithKeyword(),
                    'year' => ['type' => 'integer'],
                    'series' => $this->textWithKeyword(),
                    'tags' => ['type' => 'keyword'],
                    'text' => ['type' => 'text'],
                    'embedding' => [
                        'type' => 'knn_vector',
                        'dimension' => (int) config('opensearch.vector.dimensions', 2048),
                        'method' => [
                            'name' => 'hnsw',
                            'space_type' => config('opensearch.vector.space_type', 'cosinesimil'),
                            'engine' => config('opensearch.vector.engine', 'lucene'),
                        ],
                    ],
                    'embedding_model' => ['type' => 'keyword'],
                    'embedding_dimensions' => ['type' => 'integer'],
                    'created_time' => ['type' => 'date'],
                    'updated_time' => ['type' => 'date'],
                ],
            ],
        ];
    }

    /**
     * Return the injected client, or build one once and reuse it afterwards.
     */
    private function client(): Client
    {
        return $this->resolvedClient ??= $this->client ?? (new OpenSearchClientFactory())->make();
    }

    /**
     * Name of the chunk index (`opensearch.indices.chunks`, default `proofdb_chunks`).
     */
    private function indexName(): string
    {
        return config('opensearch.indices.chunks', 'proofdb_chunks');
    }

    /**
     * Mapping for a full-text field with an exact-match `keyword` sub-field.
     */
    private function textWithKeyword(): array
    {
        return [
            'type' => 'text',
            'fields' => [
                'keyword' => [
                    'type' => 'keyword',
                    'ignore_above' => 512,
                ],
            ],
        ];
    }
}
|
||||
32
app/service/Search/OpenSearchClientFactory.php
Normal file
32
app/service/Search/OpenSearchClientFactory.php
Normal file
@ -0,0 +1,32 @@
|
||||
<?php
|
||||
|
||||
namespace app\service\Search;
|
||||
|
||||
use OpenSearch\Client;
|
||||
use OpenSearch\ClientBuilder;
|
||||
|
||||
/**
 * Builds OpenSearch clients from a connection configuration array.
 */
class OpenSearchClientFactory
{
    /**
     * Create a client.
     *
     * Reads `hosts`, `ssl_verify`, `timeout`, `connect_timeout`, `username`
     * and `password` from the configuration. Basic authentication is only
     * applied when both credentials are non-blank after trimming.
     *
     * @param array|null $config Connection settings; `opensearch.default` is used when null.
     */
    public function make(?array $config = null): Client
    {
        if ($config === null) {
            $config = config('opensearch.default', []);
        }

        $httpOptions = [
            'client' => [
                'timeout' => (float) ($config['timeout'] ?? 30),
                'connect_timeout' => (float) ($config['connect_timeout'] ?? 5),
            ],
        ];

        $builder = ClientBuilder::create()
            ->setHosts($config['hosts'] ?? ['http://127.0.0.1:9200'])
            ->setSSLVerification((bool) ($config['ssl_verify'] ?? true))
            ->setConnectionParams($httpOptions);

        $user = trim((string) ($config['username'] ?? ''));
        $secret = trim((string) ($config['password'] ?? ''));
        $hasCredentials = !($user === '' || $secret === '');
        if ($hasCredentials) {
            $builder->setBasicAuthentication($user, $secret);
        }

        return $builder->build();
    }
}
|
||||
350
app/service/Search/OpenSearchSearchService.php
Normal file
350
app/service/Search/OpenSearchSearchService.php
Normal file
@ -0,0 +1,350 @@
|
||||
<?php
|
||||
|
||||
namespace app\service\Search;
|
||||
|
||||
use app\service\Embedding\BigModelEmbeddingClient;
|
||||
use InvalidArgumentException;
|
||||
use OpenSearch\Client;
|
||||
|
||||
/**
 * Chunk search over the OpenSearch chunk index: BM25 full-text search,
 * k-NN vector search, and a hybrid mode that fuses both rankings.
 */
class OpenSearchSearchService
{
    private const DEFAULT_LIMIT = 10;

    private const MAX_LIMIT = 50;

    private const DEFAULT_RRF_K = 60;

    /** Client actually in use; built on first need when none was injected. */
    private ?Client $resolvedClient = null;

    /**
     * All collaborators are optional; defaults are created lazily when needed.
     */
    public function __construct(
        private readonly ?Client $client = null,
        private readonly ?BigModelEmbeddingClient $embeddingClient = null,
        private readonly ?SearchKeywordService $keywordService = null
    ) {
    }

    /**
     * Full-text search across text, title, source, author, series and tags.
     *
     * @param array $payload Reads `query` (required), `limit`, `filters`.
     *
     * @return array Keys: mode, query, limit, filters, total, hits.
     *
     * @throws InvalidArgumentException When `query` is blank.
     */
    public function fulltext(array $payload): array
    {
        $query = $this->requireQuery($payload);
        $limit = $this->limit($payload['limit'] ?? self::DEFAULT_LIMIT);
        $filters = is_array($payload['filters'] ?? null) ? $payload['filters'] : [];

        $body = [
            'size' => $limit,
            'query' => [
                'bool' => [
                    'must' => [
                        [
                            'multi_match' => [
                                'query' => $query,
                                'fields' => [
                                    'text^4',
                                    'title^3',
                                    'source^2',
                                    'author^2',
                                    'series^2',
                                    'tags^2',
                                ],
                                'type' => 'best_fields',
                            ],
                        ],
                    ],
                    'filter' => $this->filters($filters),
                ],
            ],
            '_source' => $this->sourceFields(),
        ];

        $response = $this->client()->search([
            'index' => config('opensearch.indices.chunks', 'proofdb_chunks'),
            'body' => $body,
        ]);

        return [
            'mode' => 'fulltext',
            'query' => $query,
            'limit' => $limit,
            'filters' => $filters,
            'total' => $this->total($response),
            'hits' => $this->hits($response),
        ];
    }

    /**
     * Vector search: embeds the query and runs a k-NN query on `embedding`.
     *
     * @param array $payload Reads `query` (required), `limit`, `k` (defaults to limit), `filters`.
     *
     * @return array Keys: mode, query, limit, k, filters, embedding_model,
     *               embedding_dimensions, total, hits.
     *
     * @throws InvalidArgumentException When `query` is blank or no usable embedding is produced.
     */
    public function vector(array $payload): array
    {
        $query = $this->requireQuery($payload);
        $limit = $this->limit($payload['limit'] ?? self::DEFAULT_LIMIT);
        $k = $this->limit($payload['k'] ?? $limit);
        $filters = is_array($payload['filters'] ?? null) ? $payload['filters'] : [];
        $embedding = $this->queryEmbedding($query);

        $response = $this->client()->search([
            'index' => config('opensearch.indices.chunks', 'proofdb_chunks'),
            'body' => [
                'size' => $limit,
                'query' => [
                    'bool' => [
                        'must' => [
                            [
                                'knn' => [
                                    'embedding' => [
                                        'vector' => $embedding,
                                        'k' => $k,
                                    ],
                                ],
                            ],
                        ],
                        'filter' => $this->filters($filters),
                    ],
                ],
                '_source' => $this->sourceFields(),
            ],
        ]);

        return [
            'mode' => 'vector',
            'query' => $query,
            'limit' => $limit,
            'k' => $k,
            'filters' => $filters,
            'embedding_model' => config('LLMapi.embedding.model', 'embedding-3'),
            'embedding_dimensions' => count($embedding),
            'total' => $this->total($response),
            'hits' => $this->hits($response),
        ];
    }

    /**
     * Hybrid search: runs full-text and vector search and merges the two hit
     * lists by summing 1 / (rrf_k + rank) per list.
     *
     * When AI keywords are enabled, the full-text side uses the generated
     * keyword query while the vector side keeps the original query.
     *
     * @param array $payload Reads `query` (required), `limit`, `candidate_limit`,
     *                       `rrf_k`, `filters`, `ai`.
     *
     * @return array Keys: mode, query, limit, candidate_limit, rrf_k, filters, ai,
     *               fulltext_query, vector_query, keywords, total, hits, sources.
     *
     * @throws InvalidArgumentException When `query` is blank.
     */
    public function hybrid(array $payload): array
    {
        $query = $this->requireQuery($payload);
        $limit = $this->limit($payload['limit'] ?? self::DEFAULT_LIMIT);
        $candidateLimit = $this->limit($payload['candidate_limit'] ?? max($limit * 3, 20));
        $rrfK = max(1, (int) ($payload['rrf_k'] ?? self::DEFAULT_RRF_K));
        $filters = is_array($payload['filters'] ?? null) ? $payload['filters'] : [];

        $aiKeywords = null;
        $fulltextQuery = $query;
        if ($this->aiEnabled($payload)) {
            $aiKeywords = $this->keywordService()->generate($query);
            $fulltextQuery = trim((string) ($aiKeywords['query'] ?? '')) ?: $query;
        }

        $basePayload = [
            'query' => $query,
            'limit' => $candidateLimit,
            'k' => $candidateLimit,
            'filters' => $filters,
        ];
        $fulltextPayload = $basePayload;
        $fulltextPayload['query'] = $fulltextQuery;

        $fulltext = $this->fulltext($fulltextPayload);
        $vector = $this->vector($basePayload);
        $hits = $this->rrf($fulltext['hits'], $vector['hits'], $rrfK);

        return [
            'mode' => 'hybrid',
            'query' => $query,
            'limit' => $limit,
            'candidate_limit' => $candidateLimit,
            'rrf_k' => $rrfK,
            'filters' => $filters,
            'ai' => $aiKeywords !== null,
            'fulltext_query' => $fulltextQuery,
            'vector_query' => $query,
            'keywords' => $aiKeywords,
            'total' => count($hits),
            'hits' => array_slice($hits, 0, $limit),
            'sources' => [
                'fulltext_total' => $fulltext['total'],
                'vector_total' => $vector['total'],
                'fulltext_hits' => count($fulltext['hits']),
                'vector_hits' => count($vector['hits']),
            ],
        ];
    }

    /**
     * Extract the trimmed `query` from a payload.
     *
     * @throws InvalidArgumentException When the query is missing or blank.
     */
    private function requireQuery(array $payload): string
    {
        $query = trim((string) ($payload['query'] ?? ''));
        if ($query === '') {
            throw new InvalidArgumentException('query is required.');
        }

        return $query;
    }

    /**
     * Whether AI keyword generation was requested and is enabled in config.
     *
     * The `ai` flag is parsed as a boolean value, so strings such as
     * "false", "0" or "off" count as disabled instead of being truthy.
     */
    private function aiEnabled(array $payload): bool
    {
        $requested = filter_var($payload['ai'] ?? false, FILTER_VALIDATE_BOOLEAN);

        return $requested && (bool) config('LLMapi.search_keywords.enabled', true);
    }

    /**
     * Translate the request's filter map into OpenSearch filter clauses.
     *
     * Only scalar values are accepted for single-value filters; arrays and
     * other non-scalar values are ignored rather than being stringified.
     *
     * @param array $filters Reads archive_uid, chunk_uid, source, author, series, year, tags.
     *
     * @return array[] `term` / `terms` clauses (possibly empty).
     */
    private function filters(array $filters): array
    {
        // Request key => indexed field. Text fields filter on their keyword sub-field.
        $termFields = [
            'archive_uid' => 'archive_uid',
            'chunk_uid' => 'chunk_uid',
            'source' => 'source.keyword',
            'author' => 'author.keyword',
            'series' => 'series.keyword',
        ];

        $clauses = [];
        foreach ($termFields as $key => $field) {
            $value = $filters[$key] ?? null;
            if (is_scalar($value) && !empty($value)) {
                $clauses[] = ['term' => [$field => (string) $value]];
            }
        }

        if (isset($filters['year']) && is_numeric($filters['year'])) {
            $clauses[] = ['term' => ['year' => (int) $filters['year']]];
        }

        if (!empty($filters['tags'])) {
            $tags = is_array($filters['tags']) ? $filters['tags'] : [$filters['tags']];
            $tags = array_values(array_filter(array_map('strval', array_filter($tags, 'is_scalar'))));
            if ($tags !== []) {
                $clauses[] = ['terms' => ['tags' => $tags]];
            }
        }

        return $clauses;
    }

    /**
     * Flatten the response's hits into result rows (score plus source fields).
     */
    private function hits(array $response): array
    {
        $hits = [];
        foreach ($response['hits']['hits'] ?? [] as $hit) {
            $source = is_array($hit['_source'] ?? null) ? $hit['_source'] : [];
            $hits[] = [
                'score' => (float) ($hit['_score'] ?? 0),
                'chunk_uid' => $source['chunk_uid'] ?? null,
                'archive_uid' => $source['archive_uid'] ?? null,
                'chunk_index' => $source['chunk_index'] ?? null,
                'page_start' => $source['page_start'] ?? null,
                'page_end' => $source['page_end'] ?? null,
                'title' => $source['title'] ?? null,
                'source' => $source['source'] ?? null,
                'author' => $source['author'] ?? null,
                'year' => $source['year'] ?? null,
                'series' => $source['series'] ?? null,
                'tags' => $source['tags'] ?? [],
                'text' => $source['text'] ?? '',
                'embedding_model' => $source['embedding_model'] ?? null,
                'embedding_dimensions' => $source['embedding_dimensions'] ?? null,
            ];
        }

        return $hits;
    }

    /**
     * Merge two ranked hit lists; sorted by fused score, then by best raw score.
     */
    private function rrf(array $fulltextHits, array $vectorHits, int $rrfK): array
    {
        $merged = [];
        $this->mergeRankedHits($merged, $fulltextHits, 'fulltext', $rrfK);
        $this->mergeRankedHits($merged, $vectorHits, 'vector', $rrfK);

        usort($merged, static function (array $a, array $b): int {
            $scoreCompare = ($b['hybrid_score'] ?? 0) <=> ($a['hybrid_score'] ?? 0);
            if ($scoreCompare !== 0) {
                return $scoreCompare;
            }

            return ($b['score'] ?? 0) <=> ($a['score'] ?? 0);
        });

        return array_values($merged);
    }

    /**
     * Add one ranked list's contributions to the merged map (keyed by chunk UID).
     *
     * Each hit adds 1 / ($rrfK + rank) to `hybrid_score`, keeps the highest raw
     * `score` seen, and records its rank details under `rank_sources[$source]`.
     * Hits without a chunk UID are skipped.
     */
    private function mergeRankedHits(array &$merged, array $hits, string $source, int $rrfK): void
    {
        foreach ($hits as $index => $hit) {
            $chunkUid = (string) ($hit['chunk_uid'] ?? '');
            if ($chunkUid === '') {
                continue;
            }

            $rank = $index + 1;
            $contribution = 1 / ($rrfK + $rank);
            if (!isset($merged[$chunkUid])) {
                $merged[$chunkUid] = $hit;
                $merged[$chunkUid]['score'] = 0.0;
                $merged[$chunkUid]['hybrid_score'] = 0.0;
                $merged[$chunkUid]['rank_sources'] = [];
            }

            $merged[$chunkUid]['hybrid_score'] += $contribution;
            $merged[$chunkUid]['score'] = max((float) ($merged[$chunkUid]['score'] ?? 0), (float) ($hit['score'] ?? 0));
            $merged[$chunkUid]['rank_sources'][$source] = [
                'rank' => $rank,
                'score' => (float) ($hit['score'] ?? 0),
                'rrf' => $contribution,
            ];
        }
    }

    /**
     * Read the total hit count, whether reported as `{value: n}` or a plain number.
     */
    private function total(array $response): int
    {
        $total = $response['hits']['total'] ?? 0;
        if (is_array($total)) {
            return (int) ($total['value'] ?? 0);
        }

        return (int) $total;
    }

    /**
     * Clamp a requested size to the range 1..MAX_LIMIT.
     */
    private function limit(mixed $value): int
    {
        return min(self::MAX_LIMIT, max(1, (int) $value));
    }

    /**
     * Embed the query text and verify the vector length matches the index.
     *
     * @return float[] The query vector.
     *
     * @throws InvalidArgumentException When no embedding is returned or its
     *                                  length differs from `opensearch.vector.dimensions`.
     */
    private function queryEmbedding(string $query): array
    {
        $payload = $this->embeddingClient()->embed([$query], [
            'model' => config('LLMapi.embedding.model', 'embedding-3'),
            'dimensions' => config('LLMapi.embedding.dimensions', 2048),
        ]);

        $embedding = $payload['data'][0]['embedding'] ?? null;
        if (!is_array($embedding)) {
            throw new InvalidArgumentException('query embedding could not be generated.');
        }

        $dimensions = (int) config('opensearch.vector.dimensions', 2048);
        if (count($embedding) !== $dimensions) {
            throw new InvalidArgumentException("query embedding dimensions must be {$dimensions}.");
        }

        return array_map('floatval', $embedding);
    }

    /**
     * Source fields requested from OpenSearch (the vector itself is not fetched).
     */
    private function sourceFields(): array
    {
        return [
            'chunk_uid',
            'archive_uid',
            'chunk_index',
            'page_start',
            'page_end',
            'title',
            'source',
            'author',
            'year',
            'series',
            'tags',
            'text',
            'embedding_model',
            'embedding_dimensions',
        ];
    }

    /**
     * Return the injected client, or build one once and reuse it afterwards.
     */
    private function client(): Client
    {
        return $this->resolvedClient ??= $this->client ?? (new OpenSearchClientFactory())->make();
    }

    /**
     * Injected embedding client, or a default instance.
     */
    private function embeddingClient(): BigModelEmbeddingClient
    {
        return $this->embeddingClient ?? new BigModelEmbeddingClient();
    }

    /**
     * Injected keyword service, or a default instance.
     */
    private function keywordService(): SearchKeywordService
    {
        return $this->keywordService ?? new SearchKeywordService();
    }
}
|
||||
13
app/service/Search/SearchIndexStatus.php
Normal file
13
app/service/Search/SearchIndexStatus.php
Normal file
@ -0,0 +1,13 @@
|
||||
<?php
|
||||
|
||||
namespace app\service\Search;
|
||||
|
||||
/**
 * Integer codes written to the `chunks.search_index_status` column.
 *
 * NOTE(review): the per-state wording below is read off the constant names;
 * confirm against the indexing worker before relying on it.
 */
class SearchIndexStatus
{
    /** Not yet picked up for indexing. */
    public const PENDING = 0;

    /** Selected by the dispatcher and waiting for a worker. */
    public const QUEUED = 1;

    /** A worker is sending the chunk to OpenSearch. */
    public const INDEXING = 2;

    /** Stored in the search index. */
    public const INDEXED = 3;

    /** Failed; picked up again when pending archive tasks are queued. */
    public const FAILED_RETRYABLE = 4;

    /** Failed; not picked up again. */
    public const FAILED_TERMINAL = 5;
}
|
||||
115
app/service/Search/SearchKeywordService.php
Normal file
115
app/service/Search/SearchKeywordService.php
Normal file
@ -0,0 +1,115 @@
|
||||
<?php
|
||||
|
||||
namespace app\service\Search;
|
||||
|
||||
use app\service\LLM\LLMRetryQueue;
|
||||
use app\service\LLM\OpenAICompatibleClient;
|
||||
use Throwable;
|
||||
|
||||
/**
 * Uses a chat LLM to rewrite a natural-language question into keywords
 * suited to full-text search, falling back to the original query on failure.
 */
class SearchKeywordService
{
    private OpenAICompatibleClient $client;

    private LLMRetryQueue $retryQueue;

    /**
     * @param OpenAICompatibleClient|null $client     LLM client; built from `LLMapi.default`
     *                                                with keyword-specific timeouts when null.
     * @param LLMRetryQueue|null          $retryQueue Retry runner; a default instance when null.
     */
    public function __construct(?OpenAICompatibleClient $client = null, ?LLMRetryQueue $retryQueue = null)
    {
        $this->client = $client ?? new OpenAICompatibleClient($this->clientConfig());
        $this->retryQueue = $retryQueue ?? new LLMRetryQueue();
    }

    /**
     * Generate search keywords for a query.
     *
     * Never throws for LLM problems: an unconfigured client, a failed call or
     * an unusable answer all yield a fallback result whose `query` is the
     * original text and whose `error` explains why. `attempted` tells whether
     * the LLM was actually called.
     *
     * @param string $query The user's question.
     *
     * @return array Keys: enabled, attempted, error, keywords, query, model.
     */
    public function generate(string $query): array
    {
        if (!$this->client->isConfigured()) {
            return $this->fallback($query, 'LLM API is not configured.');
        }

        try {
            $result = $this->retryQueue->run(
                fn (): array => $this->client->chatJson($this->messages($query), [
                    'model' => config('LLMapi.search_keywords.model', config('LLMapi.metadata.model')),
                    'temperature' => config('LLMapi.search_keywords.temperature', 0.1),
                    'max_tokens' => config('LLMapi.search_keywords.max_tokens', 300),
                    'stream' => false,
                    'response_format' => config('LLMapi.search_keywords.response_format', ['type' => 'json_object']),
                    'thinking' => config('LLMapi.search_keywords.thinking', ['type' => 'disabled']),
                    'request_id' => 'search-keywords-' . substr(hash('sha256', $query), 0, 32),
                ]),
                config('LLMapi.search_keywords.retry', config('LLMapi.metadata.retry', []))
            );
        } catch (Throwable $exception) {
            // The call was made and failed, so report it as attempted.
            return $this->fallback($query, $exception->getMessage(), true);
        }

        $keywords = $this->keywords($result);
        if ($keywords === []) {
            return $this->fallback($query, 'LLM returned no usable keywords.', true);
        }

        return [
            'enabled' => true,
            'attempted' => true,
            'error' => null,
            'keywords' => $keywords,
            'query' => implode(' ', $keywords),
            'model' => config('LLMapi.search_keywords.model', config('LLMapi.metadata.model')),
        ];
    }

    /**
     * Chat messages for the keyword request: a system prompt with the task
     * rules and a user message carrying the query as JSON.
     */
    private function messages(string $query): array
    {
        return [
            [
                'role' => 'system',
                'content' => implode("\n", [
                    '你是历史档案检索关键词生成助手。',
                    '任务:把用户的自然语言问题改写为适合 BM25 全文检索的关键词。',
                    '优先输出档案中可能出现的英文专名、政策名、事件名、人物名、地点名、缩写和年份。',
                    '如果用户输入中文,请翻译或扩展为可能出现在英文档案中的关键词。',
                    '只返回 JSON 对象,不要 Markdown,不要解释。',
                    'JSON 格式:{"keywords":["keyword1","keyword2"],"query":"keyword1 keyword2"}。',
                    'keywords 数量 3-12 个;不要编造过于具体而输入中没有依据的事实。',
                ]),
            ],
            [
                'role' => 'user',
                'content' => json_encode(['query' => $query], JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES),
            ],
        ];
    }

    /**
     * Client configuration: `LLMapi.default` with keyword-specific timeouts.
     */
    private function clientConfig(): array
    {
        $config = config('LLMapi.default', []);
        $config['timeout'] = config('LLMapi.search_keywords.timeout', 12);
        $config['connect_timeout'] = config('LLMapi.search_keywords.connect_timeout', 5);

        return $config;
    }

    /**
     * Extract a clean keyword list from the LLM's JSON answer.
     *
     * Uses the `keywords` array when it yields at least one keyword;
     * otherwise splits the `query` string on whitespace.
     *
     * @param array $result Decoded LLM answer.
     *
     * @return string[] Trimmed, de-duplicated, non-empty keywords.
     */
    private function keywords(array $result): array
    {
        $keywords = [];
        if (is_array($result['keywords'] ?? null)) {
            $keywords = $this->cleanKeywords($result['keywords']);
        }

        if ($keywords === [] && is_string($result['query'] ?? null)) {
            $keywords = $this->cleanKeywords(preg_split('/\s+/', $result['query']) ?: []);
        }

        return $keywords;
    }

    /**
     * Trim scalar entries, drop empty ones and duplicates; non-scalar
     * entries (e.g. nested arrays) are ignored.
     */
    private function cleanKeywords(array $values): array
    {
        $strings = [];
        foreach ($values as $value) {
            if (is_scalar($value)) {
                $strings[] = trim((string) $value);
            }
        }

        return array_values(array_unique(array_filter($strings)));
    }

    /**
     * Result used when no keywords are available; searches then use the original query.
     *
     * @param string $query     Original query, returned unchanged.
     * @param string $error     Why keyword generation produced nothing.
     * @param bool   $attempted Whether the LLM was actually called.
     */
    private function fallback(string $query, string $error, bool $attempted = false): array
    {
        return [
            'enabled' => true,
            'attempted' => $attempted,
            'error' => $error,
            'keywords' => [],
            'query' => $query,
            'model' => null,
        ];
    }
}
|
||||
126
app/service/Task/ProofDbTaskQueue.php
Normal file
126
app/service/Task/ProofDbTaskQueue.php
Normal file
@ -0,0 +1,126 @@
|
||||
<?php
|
||||
|
||||
namespace app\service\Task;
|
||||
|
||||
use support\Redis;
|
||||
|
||||
/**
 * Redis-backed task queue with a pending list, a delayed (retry) sorted set
 * and a failed list. Tasks are stored as JSON strings.
 */
class ProofDbTaskQueue
{
    private const JSON_FLAGS = JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES;

    /**
     * Append a task to the pending list.
     */
    public function push(array $task): void
    {
        Redis::lPush($this->pendingKey(), json_encode($task, self::JSON_FLAGS));
    }

    /**
     * Block up to `$timeout` seconds for the next pending task.
     *
     * @return array|null The decoded task, or null on timeout or when the
     *                    stored payload does not decode to an array.
     */
    public function pop(int $timeout = 5): ?array
    {
        $result = Redis::brPop([$this->pendingKey()], $timeout);
        if (!is_array($result) || count($result) < 2) {
            return null;
        }

        $task = json_decode((string) $result[1], true);

        return is_array($task) ? $task : null;
    }

    /**
     * Schedule a failed task for a later retry, or move it to the failed
     * list once its retry counter exceeds the configured maximum.
     *
     * The delay doubles with every retry, starting at the base delay. The
     * last error text is kept for 24 hours under the task's error key.
     *
     * @param array  $task  The task that failed.
     * @param string $error Failure description.
     */
    public function retryLater(array $task, string $error): void
    {
        $taskId = $this->taskId($task);
        $retryKey = $this->retryKey($taskId);
        $retryCount = (int) Redis::incr($retryKey);
        Redis::setEx($this->errorKey($taskId), 86400, $error);

        $task['attempt'] = $retryCount + 1;
        if ($retryCount > $this->maxRetries()) {
            Redis::lPush($this->failedKey(), json_encode($task, self::JSON_FLAGS));
            return;
        }

        $delay = $this->baseDelaySeconds() * (2 ** max(0, $retryCount - 1));
        Redis::zAdd($this->delayedKey(), time() + $delay, json_encode($task, self::JSON_FLAGS));
    }

    /**
     * Move delayed tasks whose time has come (up to 100 per call) back to
     * the pending list.
     *
     * A task is only pushed when this call actually removed it from the
     * delayed set, so concurrent dispatchers cannot enqueue it twice.
     *
     * @return int Number of tasks released by this call.
     */
    public function releaseDueDelayed(): int
    {
        $delayedKey = $this->delayedKey();
        $items = Redis::zRangeByScore($delayedKey, '-inf', (string) time(), ['limit' => [0, 100]]);
        if (!is_array($items) || $items === []) {
            return 0;
        }

        $pendingKey = $this->pendingKey();
        $released = 0;
        foreach ($items as $task) {
            // zRem reports how many members it removed; 0 means another
            // process already claimed this task.
            if ((int) Redis::zRem($delayedKey, $task) < 1) {
                continue;
            }

            Redis::lPush($pendingKey, $task);
            $released++;
        }

        return $released;
    }

    /**
     * Forget the retry counter and last error of a task.
     */
    public function clearRetry(array $task): void
    {
        $taskId = $this->taskId($task);
        Redis::del($this->retryKey($taskId), $this->errorKey($taskId));
    }

    /**
     * Seconds a worker blocks waiting for a task (`queue.tasks.block_timeout`, default 5).
     */
    public function blockTimeout(): int
    {
        return (int) config('queue.tasks.block_timeout', 5);
    }

    /**
     * Idle sleep in seconds, at least 1 (`queue.tasks.idle_sleep_seconds`, default 1).
     */
    public function idleSleepSeconds(): int
    {
        return max(1, (int) config('queue.tasks.idle_sleep_seconds', 1));
    }

    /**
     * Dispatcher interval in seconds, at least 1 (`queue.tasks.dispatcher_interval_seconds`, default 15).
     */
    public function dispatcherIntervalSeconds(): int
    {
        return max(1, (int) config('queue.tasks.dispatcher_interval_seconds', 15));
    }

    /**
     * Dispatcher batch size, at least 1 (`queue.tasks.dispatcher_batch_size`, default 20).
     */
    public function dispatcherBatchSize(): int
    {
        return max(1, (int) config('queue.tasks.dispatcher_batch_size', 20));
    }

    /**
     * Identity of a task for retry bookkeeping: `task_type:target_type:target_uid`,
     * with `unknown` for missing parts.
     */
    private function taskId(array $task): string
    {
        return implode(':', [
            (string) ($task['task_type'] ?? 'unknown'),
            (string) ($task['target_type'] ?? 'unknown'),
            (string) ($task['target_uid'] ?? 'unknown'),
        ]);
    }

    /** Redis key of the pending list. */
    private function pendingKey(): string
    {
        return config('queue.tasks.pending', 'proofdb:tasks:pending');
    }

    /** Redis key of the delayed sorted set. */
    private function delayedKey(): string
    {
        return config('queue.tasks.delayed', 'proofdb:tasks:delayed');
    }

    /** Redis key of the failed list. */
    private function failedKey(): string
    {
        return config('queue.tasks.failed', 'proofdb:tasks:failed');
    }

    /** Redis key of a task's retry counter. */
    private function retryKey(string $taskId): string
    {
        return config('queue.tasks.retry_prefix', 'proofdb:tasks:retry:') . $taskId;
    }

    /** Redis key of a task's last error text. */
    private function errorKey(string $taskId): string
    {
        return config('queue.tasks.error_prefix', 'proofdb:tasks:error:') . $taskId;
    }

    /** Retry counter value above which a task goes to the failed list (default 5). */
    private function maxRetries(): int
    {
        return (int) config('queue.tasks.max_retries', 5);
    }

    /** Delay before the first retry, in seconds (default 60). */
    private function baseDelaySeconds(): int
    {
        return (int) config('queue.tasks.base_delay_seconds', 60);
    }
}
|
||||
@ -9,6 +9,8 @@ return [
|
||||
* - LLM_API_KEY
|
||||
* - LLM_CHAT_MODEL
|
||||
* - LLM_EMBEDDING_MODEL
|
||||
* - BIGMODEL_API_BASE_URL
|
||||
* - BIGMODEL_API_KEY
|
||||
*/
|
||||
'default' => [
|
||||
'base_url' => getenv('LLM_API_BASE_URL') ?: 'https://api.openai.com/v1',
|
||||
@ -48,10 +50,43 @@ return [
|
||||
],
|
||||
|
||||
'embedding' => [
|
||||
'model' => getenv('LLM_EMBEDDING_MODEL') ?: 'text-embedding-3-small',
|
||||
'batch_size' => (int) (getenv('LLM_EMBEDDING_BATCH_SIZE') ?: 64),
|
||||
'base_url' => getenv('BIGMODEL_API_BASE_URL') ?: 'https://open.bigmodel.cn/api/paas/v4',
|
||||
'api_key' => getenv('BIGMODEL_API_KEY') ?: (getenv('LLM_EMBEDDING_API_KEY') ?: getenv('LLM_API_KEY')),
|
||||
'model' => getenv('LLM_EMBEDDING_MODEL') ?: 'embedding-3',
|
||||
'batch_size' => min(64, max(1, (int) (getenv('LLM_EMBEDDING_BATCH_SIZE') ?: 32))),
|
||||
'dimensions' => getenv('LLM_EMBEDDING_DIMENSIONS') !== false
|
||||
? (int) getenv('LLM_EMBEDDING_DIMENSIONS')
|
||||
: null,
|
||||
: 2048,
|
||||
'timeout' => (int) (getenv('LLM_EMBEDDING_TIMEOUT') ?: 60),
|
||||
'connect_timeout' => (int) (getenv('LLM_EMBEDDING_CONNECT_TIMEOUT') ?: 10),
|
||||
'retry' => [
|
||||
'enabled' => (getenv('LLM_EMBEDDING_RETRY_ENABLED') ?: 'true') !== 'false',
|
||||
'max_attempts' => (int) (getenv('LLM_EMBEDDING_RETRY_MAX_ATTEMPTS') ?: 3),
|
||||
'base_delay_ms' => (int) (getenv('LLM_EMBEDDING_RETRY_BASE_DELAY_MS') ?: 1500),
|
||||
'max_delay_ms' => (int) (getenv('LLM_EMBEDDING_RETRY_MAX_DELAY_MS') ?: 10000),
|
||||
'retry_statuses' => [429, 500, 502, 503, 504],
|
||||
'retry_error_codes' => ['1302', '1303', '1304', '1305', '1306', '1307', '1308'],
|
||||
],
|
||||
],
|
||||
|
||||
'search_keywords' => [
|
||||
'enabled' => (getenv('LLM_SEARCH_KEYWORDS_ENABLED') ?: 'true') !== 'false',
|
||||
'model' => getenv('LLM_SEARCH_KEYWORDS_MODEL') ?: (getenv('LLM_METADATA_MODEL') ?: (getenv('LLM_CHAT_MODEL') ?: 'gpt-4.1-mini')),
|
||||
'temperature' => (float) (getenv('LLM_SEARCH_KEYWORDS_TEMPERATURE') ?: 0.1),
|
||||
'max_tokens' => (int) (getenv('LLM_SEARCH_KEYWORDS_MAX_TOKENS') ?: 1920),
|
||||
'timeout' => (int) (getenv('LLM_SEARCH_KEYWORDS_TIMEOUT') ?: 12),
|
||||
'connect_timeout' => (int) (getenv('LLM_SEARCH_KEYWORDS_CONNECT_TIMEOUT') ?: 5),
|
||||
'response_format' => ['type' => 'json_object'],
|
||||
'thinking' => [
|
||||
'type' => getenv('LLM_SEARCH_KEYWORDS_THINKING') ?: 'disabled',
|
||||
],
|
||||
'retry' => [
|
||||
'enabled' => (getenv('LLM_SEARCH_KEYWORDS_RETRY_ENABLED') ?: 'true') !== 'false',
|
||||
'max_attempts' => (int) (getenv('LLM_SEARCH_KEYWORDS_RETRY_MAX_ATTEMPTS') ?: 3),
|
||||
'base_delay_ms' => (int) (getenv('LLM_SEARCH_KEYWORDS_RETRY_BASE_DELAY_MS') ?: 1500),
|
||||
'max_delay_ms' => (int) (getenv('LLM_SEARCH_KEYWORDS_RETRY_MAX_DELAY_MS') ?: 10000),
|
||||
'retry_statuses' => [429],
|
||||
'retry_error_codes' => ['1302', '1303', '1304', '1305', '1306', '1307', '1308'],
|
||||
],
|
||||
],
|
||||
];
|
||||
|
||||
@ -40,4 +40,10 @@ return [
|
||||
'refresh' => getenv('OPENSEARCH_BULK_REFRESH') ?: 'false',
|
||||
'chunk_size' => (int) (getenv('OPENSEARCH_BULK_CHUNK_SIZE') ?: 500),
|
||||
],
|
||||
|
||||
'vector' => [
|
||||
'dimensions' => (int) (getenv('OPENSEARCH_VECTOR_DIMENSIONS') ?: 2048),
|
||||
'space_type' => getenv('OPENSEARCH_VECTOR_SPACE_TYPE') ?: 'cosinesimil',
|
||||
'engine' => getenv('OPENSEARCH_VECTOR_ENGINE') ?: 'lucene',
|
||||
],
|
||||
];
|
||||
|
||||
@ -64,5 +64,17 @@ return [
|
||||
'count' => 1,
|
||||
'reloadable' => true,
|
||||
'constructor' => []
|
||||
],
|
||||
'proofdb_task_dispatcher' => [
|
||||
'handler' => app\process\ProofDbTaskDispatcher::class,
|
||||
'count' => 1,
|
||||
'reloadable' => true,
|
||||
'constructor' => []
|
||||
],
|
||||
'proofdb_task_worker' => [
|
||||
'handler' => app\process\ProofDbTaskWorker::class,
|
||||
'count' => 1,
|
||||
'reloadable' => true,
|
||||
'constructor' => []
|
||||
]
|
||||
];
|
||||
|
||||
@ -12,4 +12,17 @@ return [
|
||||
'block_timeout' => (int) (getenv('AI_METADATA_QUEUE_BLOCK_TIMEOUT') ?: 5),
|
||||
'idle_sleep_seconds' => (int) (getenv('AI_METADATA_QUEUE_IDLE_SLEEP_SECONDS') ?: 1),
|
||||
],
|
||||
'tasks' => [
|
||||
'pending' => 'proofdb:tasks:pending',
|
||||
'delayed' => 'proofdb:tasks:delayed',
|
||||
'failed' => 'proofdb:tasks:failed',
|
||||
'retry_prefix' => 'proofdb:tasks:retry:',
|
||||
'error_prefix' => 'proofdb:tasks:error:',
|
||||
'max_retries' => (int) (getenv('PROOFDB_TASK_QUEUE_MAX_RETRIES') ?: 5),
|
||||
'base_delay_seconds' => (int) (getenv('PROOFDB_TASK_QUEUE_BASE_DELAY_SECONDS') ?: 60),
|
||||
'block_timeout' => (int) (getenv('PROOFDB_TASK_QUEUE_BLOCK_TIMEOUT') ?: 5),
|
||||
'idle_sleep_seconds' => (int) (getenv('PROOFDB_TASK_QUEUE_IDLE_SLEEP_SECONDS') ?: 1),
|
||||
'dispatcher_interval_seconds' => (int) (getenv('PROOFDB_TASK_DISPATCHER_INTERVAL_SECONDS') ?: 15),
|
||||
'dispatcher_batch_size' => (int) (getenv('PROOFDB_TASK_DISPATCHER_BATCH_SIZE') ?: 20),
|
||||
],
|
||||
];
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
|
||||
return [
|
||||
'default' => [
|
||||
'password' => 'qi2005112',
|
||||
'password' => 'proofdb',
|
||||
'host' => '127.0.0.1',
|
||||
'port' => 6379,
|
||||
'database' => 0,
|
||||
|
||||
@ -15,7 +15,7 @@
|
||||
use Webman\Route;
|
||||
|
||||
Route::post('/api/articles/import', [app\controller\Api\ArticleImportController::class, 'import']);
|
||||
|
||||
|
||||
|
||||
Route::post('/api/search/fulltext', [app\controller\Api\SearchController::class, 'fulltext']);
|
||||
Route::post('/api/search/vector', [app\controller\Api\SearchController::class, 'vector']);
|
||||
Route::post('/api/search/hybrid', [app\controller\Api\SearchController::class, 'hybrid']);
|
||||
|
||||
|
||||
@ -1,11 +0,0 @@
|
||||
version: "3"
|
||||
services:
|
||||
webman:
|
||||
build: .
|
||||
container_name: docker-webman
|
||||
restart: unless-stopped
|
||||
volumes:
|
||||
- "./:/app"
|
||||
ports:
|
||||
- "8787:8787"
|
||||
command: ["php", "start.php", "start" ]
|
||||
71
readme.md
71
readme.md
@ -241,43 +241,84 @@ GET /api/evidence/{chunk_uid}
|
||||
- [x] Metadata enrichment service can request/fill `title`, `year`, `author`, `tags`, and `summary` when LLM config is available.
|
||||
- [x] LLM retry helper exists for retryable HTTP/provider errors.
|
||||
- [x] Import API documentation exists in `apidoc/importapi.md`.
|
||||
- [x] BigModel/Zhipu `embedding-3` client is implemented and verified with a live 256-dimension smoke test.
|
||||
- [x] Generic async task queue/process foundation exists: one DB dispatcher process plus one Redis worker process.
|
||||
- [x] OpenSearch client factory is implemented and supports passwordless local OpenSearch when security is disabled.
|
||||
- [x] OpenSearch `proofdb_chunks` hybrid index mapping exists with BM25 text fields and a 2048-dimension `knn_vector` embedding field.
|
||||
- [x] OpenSearch search-index task handler is implemented and writes embedded chunks through bulk upsert.
|
||||
- [x] End-to-end embedding-to-OpenSearch smoke test passed for 14 chunks: all are `embedding_status=embedded`, `search_index_status=indexed`, and OpenSearch documents contain 2048-dimension vectors.
|
||||
- [x] Full-text search service, route, controller, and external API documentation are implemented for `POST /api/search/fulltext`.
|
||||
- [x] Full-text OpenSearch smoke test passed with `query="policy documents"`, returning 12 total hits from indexed chunks.
|
||||
- [x] Vector search service, route, controller, and external API documentation are implemented for `POST /api/search/vector`.
|
||||
- [x] Vector OpenSearch smoke test passed with English and Chinese queries. Chinese query `伊拉克入侵科威特与沙漠风暴` correctly recalled the Iraq/Kuwait/Desert Storm chunk as top hit.
|
||||
- [x] Hybrid search service, route, controller, and external API documentation are implemented for `POST /api/search/hybrid` using Reciprocal Rank Fusion over full-text and vector candidates.
|
||||
- [x] Hybrid smoke tests passed: English query combines fulltext/vector ranks, and Chinese query falls back to vector recall with the Iraq/Kuwait/Desert Storm chunk as top hit.
|
||||
- [x] Hybrid search supports `ai=true`: the original query is used for vector search, while the full-text query is rewritten into BM25 keywords through the existing OpenAI-compatible LLM chat path. Keyword generation has a shorter timeout and falls back to the original query on failure.
|
||||
|
||||
### Partially Done
|
||||
|
||||
- [ ] Archive/Page/Chunk model is partly persisted: `archives` and `chunks` tables exist, but pages/page blocks are only summarized in import output and snapshots, not stored as first-class relational tables.
|
||||
- [ ] `embedding_status`, `embedding_ref`, and `embedding_model` fields exist, but no embedding generation or vector index write path exists yet.
|
||||
- [ ] `embedding_status`, `embedding_ref`, `embedding_model`, `embedding_error`, and `embedding_updated_at` fields exist; embedding generation into PostgreSQL JSONB, OpenSearch vector indexing, and vector retrieval through `POST /api/search/vector` are implemented, but re-embed maintenance commands are not implemented yet.
|
||||
- [ ] `search_index_status`, `search_index_error`, and `search_index_updated_at` fields exist and are used by the generic task dispatcher/worker.
|
||||
- [ ] Import response exposes page summaries and chunk IDs, but there is no read API yet to fetch archive, page, or chunk records after import.
|
||||
- [ ] AI metadata enrichment updates the archive row, but import-time response only reports the queue state; clients need a follow-up API or polling path to observe completed enrichment.
|
||||
- [ ] API documentation still contains an old "后续接入 MySQL" phrase; update it to PostgreSQL to match the database decision.
|
||||
- [ ] Database and Redis credentials are hard-coded in config files; move them to environment variables before production use.
|
||||
|
||||
### Async Task Contract
|
||||
|
||||
The search/vector pipeline should use two generic background processes instead of one process per task family:
|
||||
|
||||
```text
|
||||
ProofDbTaskDispatcher
|
||||
-> periodically scans PostgreSQL for unfinished work
|
||||
-> marks eligible rows as queued
|
||||
-> pushes normalized task payloads into Redis
|
||||
|
||||
ProofDbTaskWorker
|
||||
-> consumes Redis task payloads
|
||||
-> dispatches by task_type to handlers
|
||||
-> updates PostgreSQL status after success/failure
|
||||
```
|
||||
|
||||
Task payload shape:
|
||||
|
||||
```json
|
||||
{
|
||||
"task_type": "search_index",
|
||||
"target_type": "archive",
|
||||
"target_uid": "01...",
|
||||
"attempt": 1
|
||||
}
|
||||
```
|
||||
|
||||
Initial task types:
|
||||
|
||||
- `search_index`: enqueue records where `search_index_status != indexed`; handler writes chunks to OpenSearch.
|
||||
- `embedding`: enqueue records where `embedding_status in pending, queued, failed_retryable`; handler calls BigModel/Zhipu `embedding-3` and writes embedding references.
|
||||
|
||||
Redis tasks may be duplicated or lost; PostgreSQL status is the recovery source of truth. Task handlers must be idempotent around `archive_uid` / `chunk_uid`.
|
||||
|
||||
### Not Done
|
||||
|
||||
- [ ] OpenSearch integration is not implemented.
|
||||
- [ ] Full-text indexing of chunks is not implemented.
|
||||
- [ ] Full-text search API is not implemented: `POST /api/search/fulltext`.
|
||||
- [ ] Embedding API/client for vector generation is not implemented.
|
||||
- [ ] Vector database integration is not implemented, neither OpenSearch kNN nor Qdrant.
|
||||
- [ ] Vector search API is not implemented: `POST /api/search/vector`.
|
||||
- [ ] Hybrid search fusion/rerank is not implemented: `POST /api/search/hybrid`.
|
||||
- [ ] Evidence reconstruction API is not implemented: `GET /api/evidence/{chunk_uid}`.
|
||||
- [ ] Chunk detail API is not implemented: `GET /api/chunks/{chunk_uid}`.
|
||||
- [ ] Page-level citation reconstruction is not implemented beyond storing `page_start` and `page_end` on chunks.
|
||||
- [ ] OpenSearch/Vector schema, index mappings, and migration/setup scripts are not present.
|
||||
- [ ] Background worker for embedding pending chunks is not present.
|
||||
- [ ] Reindex/re-embed maintenance commands are not present.
|
||||
- [ ] Reindex maintenance should detect/recover OpenSearch index loss or stale `search_index_status=indexed` rows when the index has been recreated.
|
||||
- [ ] Request validation is handwritten in the service; no dedicated validator classes or reusable validation layer are present.
|
||||
- [ ] Automated tests for Markdown parsing, chunking, import persistence, queue behavior, and metadata enrichment are not present.
|
||||
- [ ] API authentication, rate limiting, and admin controls are not present.
|
||||
- [ ] Observability for import/search/enrichment jobs is minimal; no structured job metrics or admin status endpoints are present.
|
||||
- [ ] Default index page/view still uses Webman starter content and is not Proof DB specific.
|
||||
|
||||
### Future Optimizations
|
||||
|
||||
- [ ] Extend full-text search from single `query` string over `multi_match` fields to multi-query bool search, for example `queries: ["Iraq Kuwait", "Desert Storm", "policy documents"]` mapped to OpenSearch `bool.should`.
|
||||
|
||||
### Next Build Order
|
||||
|
||||
1. Normalize remaining API documentation wording from MySQL to PostgreSQL.
|
||||
2. Add read APIs for archives/chunks/evidence so imported data can be verified without reading snapshots or the database directly.
|
||||
3. Add focused tests for DOCMASTER page parsing, noise filtering, comment coalescing, chunk UID stability, and repository persistence.
|
||||
4. Implement embedding generation worker and persist `embedding_ref`/`embedding_model`.
|
||||
5. Add OpenSearch full-text indexing and `POST /api/search/fulltext`.
|
||||
6. Add vector backend choice and `POST /api/search/vector`.
|
||||
7. Implement hybrid fusion/rerank and citation-oriented evidence reconstruction.
|
||||
4. Add async task foundation: task statuses, Redis task payload format, generic DB dispatcher process, and generic Redis worker process. (Done for embedding and OpenSearch indexing)
|
||||
5. Add chunk detail API and evidence reconstruction API.
|
||||
|
||||
@ -41,6 +41,8 @@ CREATE TABLE IF NOT EXISTS chunks (
|
||||
embedding_status INTEGER NOT NULL DEFAULT 0,
|
||||
embedding_ref JSONB,
|
||||
embedding_model TEXT,
|
||||
created_time TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
updated_time TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
CONSTRAINT chunks_archive_uid_foreign
|
||||
FOREIGN KEY (archive_uid)
|
||||
REFERENCES archives (archive_uid)
|
||||
@ -53,9 +55,17 @@ SQL,
|
||||
'CREATE INDEX IF NOT EXISTS archives_series_index ON archives (series)',
|
||||
'CREATE INDEX IF NOT EXISTS archives_tags_gin_index ON archives USING GIN (tags)',
|
||||
'CREATE INDEX IF NOT EXISTS archives_metadata_gin_index ON archives USING GIN (metadata)',
|
||||
'ALTER TABLE chunks ADD COLUMN IF NOT EXISTS embedding_error TEXT',
|
||||
'ALTER TABLE chunks ADD COLUMN IF NOT EXISTS embedding_updated_at TIMESTAMPTZ',
|
||||
'ALTER TABLE chunks ADD COLUMN IF NOT EXISTS search_index_status INTEGER NOT NULL DEFAULT 0',
|
||||
'ALTER TABLE chunks ADD COLUMN IF NOT EXISTS search_index_error TEXT',
|
||||
'ALTER TABLE chunks ADD COLUMN IF NOT EXISTS search_index_updated_at TIMESTAMPTZ',
|
||||
'ALTER TABLE chunks ADD COLUMN IF NOT EXISTS created_time TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP',
|
||||
'ALTER TABLE chunks ADD COLUMN IF NOT EXISTS updated_time TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP',
|
||||
'CREATE INDEX IF NOT EXISTS chunks_archive_uid_index ON chunks (archive_uid)',
|
||||
'CREATE INDEX IF NOT EXISTS chunks_page_range_index ON chunks (archive_uid, page_start, page_end)',
|
||||
'CREATE INDEX IF NOT EXISTS chunks_embedding_status_index ON chunks (embedding_status)',
|
||||
'CREATE INDEX IF NOT EXISTS chunks_search_index_status_index ON chunks (search_index_status)',
|
||||
<<<SQL
|
||||
CREATE OR REPLACE FUNCTION set_updated_time()
|
||||
RETURNS TRIGGER AS $$
|
||||
@ -66,11 +76,18 @@ END;
|
||||
$$ LANGUAGE plpgsql
|
||||
SQL,
|
||||
'DROP TRIGGER IF EXISTS archives_set_updated_time ON archives',
|
||||
'DROP TRIGGER IF EXISTS chunks_set_updated_time ON chunks',
|
||||
<<<SQL
|
||||
CREATE TRIGGER archives_set_updated_time
|
||||
BEFORE UPDATE ON archives
|
||||
FOR EACH ROW
|
||||
EXECUTE FUNCTION set_updated_time()
|
||||
SQL,
|
||||
<<<SQL
|
||||
CREATE TRIGGER chunks_set_updated_time
|
||||
BEFORE UPDATE ON chunks
|
||||
FOR EACH ROW
|
||||
EXECUTE FUNCTION set_updated_time()
|
||||
SQL,
|
||||
];
|
||||
|
||||
|
||||
18
scripts/setup_opensearch.php
Normal file
18
scripts/setup_opensearch.php
Normal file
@ -0,0 +1,18 @@
|
||||
#!/usr/bin/env php
<?php

use app\service\Search\OpenSearchChunkIndex;

require __DIR__ . '/../vendor/autoload.php';
require __DIR__ . '/../support/bootstrap.php';

// Setup script: makes sure the OpenSearch chunk index exists, then reports
// the configured index name and vector dimensions. Any failure is written
// to STDERR as "<exception class>: <message>" and the script exits with 1.
try {
    (new OpenSearchChunkIndex())->ensureExists();

    $indexName = config('opensearch.indices.chunks', 'proofdb_chunks');
    echo 'OpenSearch chunk index initialized: ', $indexName, PHP_EOL;

    $dimensions = config('opensearch.vector.dimensions', 2048);
    echo 'Vector dimensions: ', $dimensions, PHP_EOL;
} catch (Throwable $failure) {
    $report = $failure::class . ': ' . $failure->getMessage() . PHP_EOL;
    fwrite(STDERR, $report);

    exit(1);
}
|
||||
Loading…
Reference in New Issue
Block a user