Merge pull request #18306 from palazski/main
feat: add mineru as document parser backend with support for both local and managed api
Commit: e8c1dbb2da

@@ -2297,6 +2297,36 @@ DATALAB_MARKER_OUTPUT_FORMAT = PersistentConfig(
    os.environ.get("DATALAB_MARKER_OUTPUT_FORMAT", "markdown"),
)

MINERU_API_MODE = PersistentConfig(
    "MINERU_API_MODE",
    "rag.mineru_api_mode",
    os.environ.get("MINERU_API_MODE", "local"),  # "local" or "cloud"
)

MINERU_API_URL = PersistentConfig(
    "MINERU_API_URL",
    "rag.mineru_api_url",
    os.environ.get("MINERU_API_URL", "http://localhost:8000"),
)

MINERU_API_KEY = PersistentConfig(
    "MINERU_API_KEY",
    "rag.mineru_api_key",
    os.environ.get("MINERU_API_KEY", ""),
)

mineru_params = os.getenv("MINERU_PARAMS", "")
try:
    mineru_params = json.loads(mineru_params)
except json.JSONDecodeError:
    mineru_params = {}

MINERU_PARAMS = PersistentConfig(
    "MINERU_PARAMS",
    "rag.mineru_params",
    mineru_params,
)

EXTERNAL_DOCUMENT_LOADER_URL = PersistentConfig(
    "EXTERNAL_DOCUMENT_LOADER_URL",
    "rag.external_document_loader_url",

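For reference, MINERU_PARAMS is read once at startup as a JSON string; below is a minimal sketch of the expected shape and the round-trip it goes through. The keys mirror the defaults the new MinerULoader reads later in this PR; the values are illustrative.

import json
import os

# Set in the server's environment before open_webui.config is imported (illustrative values).
os.environ["MINERU_PARAMS"] = json.dumps(
    {
        "enable_ocr": False,          # False -> "auto" parse method, True -> "ocr"
        "enable_formula": True,       # formula recognition
        "enable_table": True,         # table extraction
        "language": "en",
        "model_version": "pipeline",  # or "vlm"
        "page_ranges": "",            # consumed by the Cloud API only
    }
)

# config.py then recovers the dict; malformed or empty JSON falls back to {}.
try:
    mineru_params = json.loads(os.environ.get("MINERU_PARAMS", ""))
except json.JSONDecodeError:
    mineru_params = {}
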
@@ -243,6 +243,10 @@ from open_webui.config import (
    DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION,
    DATALAB_MARKER_FORMAT_LINES,
    DATALAB_MARKER_OUTPUT_FORMAT,
    MINERU_API_MODE,
    MINERU_API_URL,
    MINERU_API_KEY,
    MINERU_PARAMS,
    DATALAB_MARKER_USE_LLM,
    EXTERNAL_DOCUMENT_LOADER_URL,
    EXTERNAL_DOCUMENT_LOADER_API_KEY,

@@ -853,6 +857,10 @@ app.state.config.DOCLING_PICTURE_DESCRIPTION_API = DOCLING_PICTURE_DESCRIPTION_A
app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT
app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY
app.state.config.MISTRAL_OCR_API_KEY = MISTRAL_OCR_API_KEY
app.state.config.MINERU_API_MODE = MINERU_API_MODE
app.state.config.MINERU_API_URL = MINERU_API_URL
app.state.config.MINERU_API_KEY = MINERU_API_KEY
app.state.config.MINERU_PARAMS = MINERU_PARAMS

app.state.config.TEXT_SPLITTER = RAG_TEXT_SPLITTER
app.state.config.TIKTOKEN_ENCODING_NAME = TIKTOKEN_ENCODING_NAME

@@ -27,6 +27,7 @@ from open_webui.retrieval.loaders.external_document import ExternalDocumentLoade

from open_webui.retrieval.loaders.mistral import MistralLoader
from open_webui.retrieval.loaders.datalab_marker import DatalabMarkerLoader
from open_webui.retrieval.loaders.mineru import MinerULoader


from open_webui.env import SRC_LOG_LEVELS, GLOBAL_LOG_LEVEL

@@ -367,6 +368,22 @@ class Loader:
                api_endpoint=self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT"),
                azure_credential=DefaultAzureCredential(),
            )
        elif self.engine == "mineru" and file_ext in [
            "pdf",
            "doc",
            "docx",
            "ppt",
            "pptx",
            "xls",
            "xlsx",
        ]:
            loader = MinerULoader(
                file_path=file_path,
                api_mode=self.kwargs.get("MINERU_API_MODE", "local"),
                api_url=self.kwargs.get("MINERU_API_URL", "http://localhost:8000"),
                api_key=self.kwargs.get("MINERU_API_KEY", ""),
                params=self.kwargs.get("MINERU_PARAMS", {}),
            )
        elif (
            self.engine == "mistral_ocr"
            and self.kwargs.get("MISTRAL_OCR_API_KEY") != ""

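As a quick sanity check, here is a standalone sketch of how the loader added in this hunk can be exercised outside the Loader wrapper. The file path and params are hypothetical; it assumes a self-hosted MinerU API listening on localhost:8000.

from open_webui.retrieval.loaders.mineru import MinerULoader

loader = MinerULoader(
    file_path="/tmp/example.pdf",     # hypothetical input file
    api_mode="local",                 # or "cloud", which also requires api_key
    api_url="http://localhost:8000",
    params={"enable_ocr": False, "language": "en"},
)
docs = loader.load()                  # returns a list with a single langchain Document
print(docs[0].metadata["api_mode"], len(docs[0].page_content))
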
backend/open_webui/retrieval/loaders/mineru.py (new file, 541 lines)

@@ -0,0 +1,541 @@
import os
import time
import requests
import logging
import tempfile
import zipfile
from typing import List, Optional
from langchain_core.documents import Document
from fastapi import HTTPException, status

log = logging.getLogger(__name__)


class MinerULoader:
    """
    MinerU document parser loader supporting both Cloud API and Local API modes.

    Cloud API: Uses MinerU managed service with async task-based processing
    Local API: Uses self-hosted MinerU API with synchronous processing
    """

    def __init__(
        self,
        file_path: str,
        api_mode: str = "local",
        api_url: str = "http://localhost:8000",
        api_key: str = "",
        params: dict = None,
    ):
        self.file_path = file_path
        self.api_mode = api_mode.lower()
        self.api_url = api_url.rstrip("/")
        self.api_key = api_key

        # Parse params dict with defaults
        params = params or {}
        self.enable_ocr = params.get("enable_ocr", False)
        self.enable_formula = params.get("enable_formula", True)
        self.enable_table = params.get("enable_table", True)
        self.language = params.get("language", "en")
        self.model_version = params.get("model_version", "pipeline")
        self.page_ranges = params.get("page_ranges", "")

        # Validate API mode
        if self.api_mode not in ["local", "cloud"]:
            raise ValueError(
                f"Invalid API mode: {self.api_mode}. Must be 'local' or 'cloud'"
            )

        # Validate Cloud API requirements
        if self.api_mode == "cloud" and not self.api_key:
            raise ValueError("API key is required for Cloud API mode")

    def load(self) -> List[Document]:
        """
        Main entry point for loading and parsing the document.
        Routes to Cloud or Local API based on api_mode.
        """
        try:
            if self.api_mode == "cloud":
                return self._load_cloud_api()
            else:
                return self._load_local_api()
        except Exception as e:
            log.error(f"Error loading document with MinerU: {e}")
            raise

    def _load_local_api(self) -> List[Document]:
        """
        Load document using Local API (synchronous).
        Posts file to /file_parse endpoint and gets immediate response.
        """
        log.info(f"Using MinerU Local API at {self.api_url}")

        filename = os.path.basename(self.file_path)

        # Build form data for Local API
        form_data = {
            "return_md": "true",
            "formula_enable": str(self.enable_formula).lower(),
            "table_enable": str(self.enable_table).lower(),
        }

        # Parse method based on OCR setting
        if self.enable_ocr:
            form_data["parse_method"] = "ocr"
        else:
            form_data["parse_method"] = "auto"

        # Language configuration (Local API uses lang_list array)
        if self.language:
            form_data["lang_list"] = self.language

        # Backend/model version (Local API uses "backend" parameter)
        if self.model_version == "vlm":
            form_data["backend"] = "vlm-vllm-engine"
        else:
            form_data["backend"] = "pipeline"

        # Page ranges (Local API uses start_page_id and end_page_id)
        if self.page_ranges:
            # For simplicity, if page_ranges is specified, log a warning
            # Full page range parsing would require parsing the string
            log.warning(
                f"Page ranges '{self.page_ranges}' specified but Local API uses different format. "
                "Consider using start_page_id/end_page_id parameters if needed."
            )

        try:
            with open(self.file_path, "rb") as f:
                files = {"files": (filename, f, "application/octet-stream")}

                log.info(f"Sending file to MinerU Local API: {filename}")
                log.debug(f"Local API parameters: {form_data}")

                response = requests.post(
                    f"{self.api_url}/file_parse",
                    data=form_data,
                    files=files,
                    timeout=300,  # 5 minute timeout for large documents
                )
                response.raise_for_status()

        except FileNotFoundError:
            raise HTTPException(
                status.HTTP_404_NOT_FOUND, detail=f"File not found: {self.file_path}"
            )
        except requests.Timeout:
            raise HTTPException(
                status.HTTP_504_GATEWAY_TIMEOUT,
                detail="MinerU Local API request timed out",
            )
        except requests.HTTPError as e:
            error_detail = f"MinerU Local API request failed: {e}"
            if e.response is not None:
                try:
                    error_data = e.response.json()
                    error_detail += f" - {error_data}"
                except:
                    error_detail += f" - {e.response.text}"
            raise HTTPException(status.HTTP_400_BAD_REQUEST, detail=error_detail)
        except Exception as e:
            raise HTTPException(
                status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Error calling MinerU Local API: {str(e)}",
            )

        # Parse response
        try:
            result = response.json()
        except ValueError as e:
            raise HTTPException(
                status.HTTP_502_BAD_GATEWAY,
                detail=f"Invalid JSON response from MinerU Local API: {e}",
            )

        # Extract markdown content from response
        if "results" not in result:
            raise HTTPException(
                status.HTTP_502_BAD_GATEWAY,
                detail="MinerU Local API response missing 'results' field",
            )

        results = result["results"]
        if not results:
            raise HTTPException(
                status.HTTP_400_BAD_REQUEST,
                detail="MinerU returned empty results",
            )

        # Get the first (and typically only) result
        file_result = list(results.values())[0]
        markdown_content = file_result.get("md_content", "")

        if not markdown_content:
            raise HTTPException(
                status.HTTP_400_BAD_REQUEST,
                detail="MinerU returned empty markdown content",
            )

        log.info(f"Successfully parsed document with MinerU Local API: {filename}")

        # Create metadata
        metadata = {
            "source": filename,
            "api_mode": "local",
            "backend": result.get("backend", "unknown"),
            "version": result.get("version", "unknown"),
        }

        return [Document(page_content=markdown_content, metadata=metadata)]

    def _load_cloud_api(self) -> List[Document]:
        """
        Load document using Cloud API (asynchronous).
        Uses batch upload endpoint to avoid need for public file URLs.
        """
        log.info(f"Using MinerU Cloud API at {self.api_url}")

        filename = os.path.basename(self.file_path)

        # Step 1: Request presigned upload URL
        batch_id, upload_url = self._request_upload_url(filename)

        # Step 2: Upload file to presigned URL
        self._upload_to_presigned_url(upload_url)

        # Step 3: Poll for results
        result = self._poll_batch_status(batch_id, filename)

        # Step 4: Download and extract markdown from ZIP
        markdown_content = self._download_and_extract_zip(
            result["full_zip_url"], filename
        )

        log.info(f"Successfully parsed document with MinerU Cloud API: {filename}")

        # Create metadata
        metadata = {
            "source": filename,
            "api_mode": "cloud",
            "batch_id": batch_id,
        }

        return [Document(page_content=markdown_content, metadata=metadata)]

    def _request_upload_url(self, filename: str) -> tuple:
        """
        Request presigned upload URL from Cloud API.
        Returns (batch_id, upload_url).
        """
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

        # Build request body
        request_body = {
            "enable_formula": self.enable_formula,
            "enable_table": self.enable_table,
            "language": self.language,
            "model_version": self.model_version,
            "files": [
                {
                    "name": filename,
                    "is_ocr": self.enable_ocr,
                }
            ],
        }

        # Add page ranges if specified
        if self.page_ranges:
            request_body["files"][0]["page_ranges"] = self.page_ranges

        log.info(f"Requesting upload URL for: {filename}")
        log.debug(f"Cloud API request body: {request_body}")

        try:
            response = requests.post(
                f"{self.api_url}/file-urls/batch",
                headers=headers,
                json=request_body,
                timeout=30,
            )
            response.raise_for_status()
        except requests.HTTPError as e:
            error_detail = f"Failed to request upload URL: {e}"
            if e.response is not None:
                try:
                    error_data = e.response.json()
                    error_detail += f" - {error_data.get('msg', error_data)}"
                except:
                    error_detail += f" - {e.response.text}"
            raise HTTPException(status.HTTP_400_BAD_REQUEST, detail=error_detail)
        except Exception as e:
            raise HTTPException(
                status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Error requesting upload URL: {str(e)}",
            )

        try:
            result = response.json()
        except ValueError as e:
            raise HTTPException(
                status.HTTP_502_BAD_GATEWAY,
                detail=f"Invalid JSON response: {e}",
            )

        # Check for API error response
        if result.get("code") != 0:
            raise HTTPException(
                status.HTTP_400_BAD_REQUEST,
                detail=f"MinerU Cloud API error: {result.get('msg', 'Unknown error')}",
            )

        data = result.get("data", {})
        batch_id = data.get("batch_id")
        file_urls = data.get("file_urls", [])

        if not batch_id or not file_urls:
            raise HTTPException(
                status.HTTP_502_BAD_GATEWAY,
                detail="MinerU Cloud API response missing batch_id or file_urls",
            )

        upload_url = file_urls[0]
        log.info(f"Received upload URL for batch: {batch_id}")

        return batch_id, upload_url

    def _upload_to_presigned_url(self, upload_url: str) -> None:
        """
        Upload file to presigned URL (no authentication needed).
        """
        log.info(f"Uploading file to presigned URL")

        try:
            with open(self.file_path, "rb") as f:
                response = requests.put(
                    upload_url,
                    data=f,
                    timeout=300,  # 5 minute timeout for large files
                )
                response.raise_for_status()
        except FileNotFoundError:
            raise HTTPException(
                status.HTTP_404_NOT_FOUND, detail=f"File not found: {self.file_path}"
            )
        except requests.Timeout:
            raise HTTPException(
                status.HTTP_504_GATEWAY_TIMEOUT,
                detail="File upload to presigned URL timed out",
            )
        except requests.HTTPError as e:
            raise HTTPException(
                status.HTTP_400_BAD_REQUEST,
                detail=f"Failed to upload file to presigned URL: {e}",
            )
        except Exception as e:
            raise HTTPException(
                status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Error uploading file: {str(e)}",
            )

        log.info("File uploaded successfully")

    def _poll_batch_status(self, batch_id: str, filename: str) -> dict:
        """
        Poll batch status until completion.
        Returns the result dict for the file.
        """
        headers = {
            "Authorization": f"Bearer {self.api_key}",
        }

        max_iterations = 300  # 10 minutes max (2 seconds per iteration)
        poll_interval = 2  # seconds

        log.info(f"Polling batch status: {batch_id}")

        for iteration in range(max_iterations):
            try:
                response = requests.get(
                    f"{self.api_url}/extract-results/batch/{batch_id}",
                    headers=headers,
                    timeout=30,
                )
                response.raise_for_status()
            except requests.HTTPError as e:
                error_detail = f"Failed to poll batch status: {e}"
                if e.response is not None:
                    try:
                        error_data = e.response.json()
                        error_detail += f" - {error_data.get('msg', error_data)}"
                    except:
                        error_detail += f" - {e.response.text}"
                raise HTTPException(status.HTTP_400_BAD_REQUEST, detail=error_detail)
            except Exception as e:
                raise HTTPException(
                    status.HTTP_500_INTERNAL_SERVER_ERROR,
                    detail=f"Error polling batch status: {str(e)}",
                )

            try:
                result = response.json()
            except ValueError as e:
                raise HTTPException(
                    status.HTTP_502_BAD_GATEWAY,
                    detail=f"Invalid JSON response while polling: {e}",
                )

            # Check for API error response
            if result.get("code") != 0:
                raise HTTPException(
                    status.HTTP_400_BAD_REQUEST,
                    detail=f"MinerU Cloud API error: {result.get('msg', 'Unknown error')}",
                )

            data = result.get("data", {})
            extract_result = data.get("extract_result", [])

            # Find our file in the batch results
            file_result = None
            for item in extract_result:
                if item.get("file_name") == filename:
                    file_result = item
                    break

            if not file_result:
                raise HTTPException(
                    status.HTTP_502_BAD_GATEWAY,
                    detail=f"File {filename} not found in batch results",
                )

            state = file_result.get("state")

            if state == "done":
                log.info(f"Processing complete for {filename}")
                return file_result
            elif state == "failed":
                error_msg = file_result.get("err_msg", "Unknown error")
                raise HTTPException(
                    status.HTTP_400_BAD_REQUEST,
                    detail=f"MinerU processing failed: {error_msg}",
                )
            elif state in ["waiting-file", "pending", "running", "converting"]:
                # Still processing
                if iteration % 10 == 0:  # Log every 20 seconds
                    log.info(
                        f"Processing status: {state} (iteration {iteration + 1}/{max_iterations})"
                    )
                time.sleep(poll_interval)
            else:
                log.warning(f"Unknown state: {state}")
                time.sleep(poll_interval)

        # Timeout
        raise HTTPException(
            status.HTTP_504_GATEWAY_TIMEOUT,
            detail="MinerU processing timed out after 10 minutes",
        )

    def _download_and_extract_zip(self, zip_url: str, filename: str) -> str:
        """
        Download ZIP file from CDN and extract markdown content.
        Returns the markdown content as a string.
        """
        log.info(f"Downloading results from: {zip_url}")

        try:
            response = requests.get(zip_url, timeout=60)
            response.raise_for_status()
        except requests.HTTPError as e:
            raise HTTPException(
                status.HTTP_400_BAD_REQUEST,
                detail=f"Failed to download results ZIP: {e}",
            )
        except Exception as e:
            raise HTTPException(
                status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Error downloading results: {str(e)}",
            )

        # Save ZIP to temporary file and extract
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp_zip:
                tmp_zip.write(response.content)
                tmp_zip_path = tmp_zip.name

            with tempfile.TemporaryDirectory() as tmp_dir:
                # Extract ZIP
                with zipfile.ZipFile(tmp_zip_path, "r") as zip_ref:
                    zip_ref.extractall(tmp_dir)

                # Find markdown file - search recursively for any .md file
                markdown_content = None
                found_md_path = None

                # First, list all files in the ZIP for debugging
                all_files = []
                for root, dirs, files in os.walk(tmp_dir):
                    for file in files:
                        full_path = os.path.join(root, file)
                        all_files.append(full_path)
                        # Look for any .md file
                        if file.endswith(".md"):
                            found_md_path = full_path
                            log.info(f"Found markdown file at: {full_path}")
                            try:
                                with open(full_path, "r", encoding="utf-8") as f:
                                    markdown_content = f.read()
                                    if (
                                        markdown_content
                                    ):  # Use the first non-empty markdown file
                                        break
                            except Exception as e:
                                log.warning(f"Failed to read {full_path}: {e}")
                    if markdown_content:
                        break

                if markdown_content is None:
                    log.error(f"Available files in ZIP: {all_files}")
                    # Try to provide more helpful error message
                    md_files = [f for f in all_files if f.endswith(".md")]
                    if md_files:
                        error_msg = (
                            f"Found .md files but couldn't read them: {md_files}"
                        )
                    else:
                        error_msg = (
                            f"No .md files found in ZIP. Available files: {all_files}"
                        )
                    raise HTTPException(
                        status.HTTP_502_BAD_GATEWAY,
                        detail=error_msg,
                    )

            # Clean up temporary ZIP file
            os.unlink(tmp_zip_path)

        except zipfile.BadZipFile as e:
            raise HTTPException(
                status.HTTP_502_BAD_GATEWAY,
                detail=f"Invalid ZIP file received: {e}",
            )
        except Exception as e:
            raise HTTPException(
                status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Error extracting ZIP: {str(e)}",
            )

        if not markdown_content:
            raise HTTPException(
                status.HTTP_400_BAD_REQUEST,
                detail="Extracted markdown content is empty",
            )

        log.info(
            f"Successfully extracted markdown content ({len(markdown_content)} characters)"
        )
        return markdown_content

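To make the parsing logic above easier to follow, here is a sketch of the response shapes the loader expects, reconstructed only from the fields the code reads; the concrete values are placeholders, not an API reference.

# Local API (/file_parse): _load_local_api() reads "results", "backend", and "version".
local_response = {
    "backend": "pipeline",
    "version": "x.y.z",  # placeholder version string
    "results": {
        "example.pdf": {"md_content": "# Extracted markdown ..."},
    },
}

# Cloud API: _request_upload_url() and _poll_batch_status() read these fields.
batch_response = {
    "code": 0,
    "data": {"batch_id": "abc123", "file_urls": ["https://..."]},  # presigned PUT URL
}
poll_response = {
    "code": 0,
    "data": {
        "extract_result": [
            {
                "file_name": "example.pdf",
                "state": "done",  # or waiting-file / pending / running / converting / failed
                "full_zip_url": "https://...",  # ZIP containing the .md output
                "err_msg": "",
            }
        ]
    },
}
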
@@ -466,6 +466,11 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
        "DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
        "DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
        "MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY,
        # MinerU settings
        "MINERU_API_MODE": request.app.state.config.MINERU_API_MODE,
        "MINERU_API_URL": request.app.state.config.MINERU_API_URL,
        "MINERU_API_KEY": request.app.state.config.MINERU_API_KEY,
        "MINERU_PARAMS": request.app.state.config.MINERU_PARAMS,
        # Reranking settings
        "RAG_RERANKING_MODEL": request.app.state.config.RAG_RERANKING_MODEL,
        "RAG_RERANKING_ENGINE": request.app.state.config.RAG_RERANKING_ENGINE,

@@ -647,6 +652,12 @@ class ConfigForm(BaseModel):
    DOCUMENT_INTELLIGENCE_KEY: Optional[str] = None
    MISTRAL_OCR_API_KEY: Optional[str] = None

    # MinerU settings
    MINERU_API_MODE: Optional[str] = None
    MINERU_API_URL: Optional[str] = None
    MINERU_API_KEY: Optional[str] = None
    MINERU_PARAMS: Optional[dict] = None

    # Reranking settings
    RAG_RERANKING_MODEL: Optional[str] = None
    RAG_RERANKING_ENGINE: Optional[str] = None

@@ -886,6 +897,28 @@ async def update_rag_config(
        else request.app.state.config.MISTRAL_OCR_API_KEY
    )

    # MinerU settings
    request.app.state.config.MINERU_API_MODE = (
        form_data.MINERU_API_MODE
        if form_data.MINERU_API_MODE is not None
        else request.app.state.config.MINERU_API_MODE
    )
    request.app.state.config.MINERU_API_URL = (
        form_data.MINERU_API_URL
        if form_data.MINERU_API_URL is not None
        else request.app.state.config.MINERU_API_URL
    )
    request.app.state.config.MINERU_API_KEY = (
        form_data.MINERU_API_KEY
        if form_data.MINERU_API_KEY is not None
        else request.app.state.config.MINERU_API_KEY
    )
    request.app.state.config.MINERU_PARAMS = (
        form_data.MINERU_PARAMS
        if form_data.MINERU_PARAMS is not None
        else request.app.state.config.MINERU_PARAMS
    )

    # Reranking settings
    if request.app.state.config.RAG_RERANKING_ENGINE == "":
        # Unloading the internal reranker and clear VRAM memory

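Since each assignment above only applies a field when it is not None, an admin client can send a partial update that touches just the MinerU settings; a sketch of such a payload, using the field names from ConfigForm (transport and endpoint omitted, values illustrative):

# Fields left out (i.e. None) keep their current values on the server.
partial_update = {
    "MINERU_API_MODE": "cloud",
    "MINERU_API_URL": "https://mineru.net/api/v4",
    "MINERU_API_KEY": "example-key",  # placeholder, not a real key
    "MINERU_PARAMS": {"enable_ocr": True, "language": "en"},
}
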
@@ -1150,6 +1183,11 @@ async def update_rag_config(
        "DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
        "DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
        "MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY,
        # MinerU settings
        "MINERU_API_MODE": request.app.state.config.MINERU_API_MODE,
        "MINERU_API_URL": request.app.state.config.MINERU_API_URL,
        "MINERU_API_KEY": request.app.state.config.MINERU_API_KEY,
        "MINERU_PARAMS": request.app.state.config.MINERU_PARAMS,
        # Reranking settings
        "RAG_RERANKING_MODEL": request.app.state.config.RAG_RERANKING_MODEL,
        "RAG_RERANKING_ENGINE": request.app.state.config.RAG_RERANKING_ENGINE,

@@ -1560,6 +1598,10 @@ def process_file(
        DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
        DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
        MISTRAL_OCR_API_KEY=request.app.state.config.MISTRAL_OCR_API_KEY,
        MINERU_API_MODE=request.app.state.config.MINERU_API_MODE,
        MINERU_API_URL=request.app.state.config.MINERU_API_URL,
        MINERU_API_KEY=request.app.state.config.MINERU_API_KEY,
        MINERU_PARAMS=request.app.state.config.MINERU_PARAMS,
    )
    docs = loader.load(
        file.filename, file.meta.get("content_type"), file_path

@@ -207,6 +207,15 @@
			return;
		}

		if (
			RAGConfig.CONTENT_EXTRACTION_ENGINE === 'mineru' &&
			RAGConfig.MINERU_API_MODE === 'cloud' &&
			RAGConfig.MINERU_API_KEY === ''
		) {
			toast.error($i18n.t('MinerU API Key required for Cloud API mode.'));
			return;
		}

		if (!RAGConfig.BYPASS_EMBEDDING_AND_RETRIEVAL) {
			await embeddingModelUpdateHandler();
		}

@@ -337,6 +346,7 @@
						<option value="datalab_marker">{$i18n.t('Datalab Marker API')}</option>
						<option value="document_intelligence">{$i18n.t('Document Intelligence')}</option>
						<option value="mistral_ocr">{$i18n.t('Mistral OCR')}</option>
						<option value="mineru">{$i18n.t('MinerU')}</option>
					</select>
				</div>
			</div>

@@ -749,6 +759,88 @@
							bind:value={RAGConfig.MISTRAL_OCR_API_KEY}
						/>
					</div>
				{:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'mineru'}
					<!-- API Mode Selection -->
					<div class="flex w-full mt-2">
						<div class="flex-1 flex justify-between">
							<div class="self-center text-xs font-medium">
								{$i18n.t('API Mode')}
							</div>
							<select
								class="dark:bg-gray-900 w-fit pr-8 rounded-sm px-2 text-xs bg-transparent outline-hidden"
								bind:value={RAGConfig.MINERU_API_MODE}
								on:change={() => {
									// Auto-update URL when switching modes if it's empty or matches the opposite mode's default
									const cloudUrl = 'https://mineru.net/api/v4';
									const localUrl = 'http://localhost:8000';

									if (RAGConfig.MINERU_API_MODE === 'cloud') {
										if (!RAGConfig.MINERU_API_URL || RAGConfig.MINERU_API_URL === localUrl) {
											RAGConfig.MINERU_API_URL = cloudUrl;
										}
									} else {
										if (!RAGConfig.MINERU_API_URL || RAGConfig.MINERU_API_URL === cloudUrl) {
											RAGConfig.MINERU_API_URL = localUrl;
										}
									}
								}}
							>
								<option value="local">{$i18n.t('Self-Hosted')}</option>
								<option value="cloud">{$i18n.t('minerU managed (Cloud API)')}</option>
							</select>
						</div>
					</div>

					<!-- API URL -->
					<div class="flex w-full mt-2">
						<input
							class="flex-1 w-full text-sm bg-transparent outline-hidden"
							placeholder={RAGConfig.MINERU_API_MODE === 'cloud'
								? $i18n.t('https://mineru.net/api/v4')
								: $i18n.t('http://localhost:8000')}
							bind:value={RAGConfig.MINERU_API_URL}
						/>
					</div>

					<!-- API Key (Cloud only) -->
					{#if RAGConfig.MINERU_API_MODE === 'cloud'}
						<div class="flex w-full mt-2">
							<SensitiveInput
								placeholder={$i18n.t('Enter MinerU API Key')}
								bind:value={RAGConfig.MINERU_API_KEY}
							/>
						</div>
					{/if}

					<!-- Parameters -->
					<div class="flex justify-between w-full mt-2">
						<div class="self-center text-xs font-medium">
							<Tooltip
								content={$i18n.t('Advanced parameters for MinerU parsing (enable_ocr, enable_formula, enable_table, language, model_version, page_ranges)')}
								placement="top-start"
							>
								{$i18n.t('Parameters')}
							</Tooltip>
						</div>
						<div class="">
							<Textarea
								value={typeof RAGConfig.MINERU_PARAMS === 'object' && RAGConfig.MINERU_PARAMS !== null && Object.keys(RAGConfig.MINERU_PARAMS).length > 0
									? JSON.stringify(RAGConfig.MINERU_PARAMS, null, 2)
									: ''}
								on:input={(e) => {
									try {
										const value = e.target.value.trim();
										RAGConfig.MINERU_PARAMS = value ? JSON.parse(value) : {};
									} catch (err) {
										// Keep the string value if JSON is invalid (user is still typing)
										RAGConfig.MINERU_PARAMS = e.target.value;
									}
								}}
								placeholder={`{\n  "enable_ocr": false,\n  "enable_formula": true,\n  "enable_table": true,\n  "language": "en",\n  "model_version": "pipeline",\n  "page_ranges": ""\n}`}
								minSize={100}
							/>
						</div>
					</div>
				{/if}
			</div>