From 40e9d9c33064adfc444400884bd1b81588510b12 Mon Sep 17 00:00:00 2001 From: palazski Date: Mon, 13 Oct 2025 21:09:52 +0300 Subject: [PATCH 1/2] feat: add mineru as document parser support with both local and managed api --- backend/open_webui/config.py | 55 ++ backend/open_webui/main.py | 18 + backend/open_webui/retrieval/loaders/main.py | 22 + .../open_webui/retrieval/loaders/mineru.py | 543 ++++++++++++++++++ backend/open_webui/routers/retrieval.py | 87 +++ .../admin/Settings/Documents.svelte | 141 +++++ 6 files changed, 866 insertions(+) create mode 100644 backend/open_webui/retrieval/loaders/mineru.py diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index bd7380762..0d2191346 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -2291,6 +2291,61 @@ DATALAB_MARKER_OUTPUT_FORMAT = PersistentConfig( os.environ.get("DATALAB_MARKER_OUTPUT_FORMAT", "markdown"), ) +# MinerU Configuration +MINERU_API_MODE = PersistentConfig( + "MINERU_API_MODE", + "rag.mineru_api_mode", + os.environ.get("MINERU_API_MODE", "local"), # "local" or "cloud" +) + +MINERU_API_URL = PersistentConfig( + "MINERU_API_URL", + "rag.mineru_api_url", + os.environ.get("MINERU_API_URL", "http://localhost:8000"), +) + +MINERU_API_KEY = PersistentConfig( + "MINERU_API_KEY", + "rag.mineru_api_key", + os.environ.get("MINERU_API_KEY", ""), +) + +MINERU_ENABLE_OCR = PersistentConfig( + "MINERU_ENABLE_OCR", + "rag.mineru_enable_ocr", + os.environ.get("MINERU_ENABLE_OCR", "false").lower() == "true", +) + +MINERU_ENABLE_FORMULA = PersistentConfig( + "MINERU_ENABLE_FORMULA", + "rag.mineru_enable_formula", + os.environ.get("MINERU_ENABLE_FORMULA", "true").lower() == "true", +) + +MINERU_ENABLE_TABLE = PersistentConfig( + "MINERU_ENABLE_TABLE", + "rag.mineru_enable_table", + os.environ.get("MINERU_ENABLE_TABLE", "true").lower() == "true", +) + +MINERU_LANGUAGE = PersistentConfig( + "MINERU_LANGUAGE", + "rag.mineru_language", + os.environ.get("MINERU_LANGUAGE", "en"), +) + +MINERU_MODEL_VERSION = PersistentConfig( + "MINERU_MODEL_VERSION", + "rag.mineru_model_version", + os.environ.get("MINERU_MODEL_VERSION", "pipeline"), # "pipeline" or "vlm" +) + +MINERU_PAGE_RANGES = PersistentConfig( + "MINERU_PAGE_RANGES", + "rag.mineru_page_ranges", + os.environ.get("MINERU_PAGE_RANGES", ""), +) + EXTERNAL_DOCUMENT_LOADER_URL = PersistentConfig( "EXTERNAL_DOCUMENT_LOADER_URL", "rag.external_document_loader_url", diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py index 221c20f30..efb897d2b 100644 --- a/backend/open_webui/main.py +++ b/backend/open_webui/main.py @@ -243,6 +243,15 @@ from open_webui.config import ( DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION, DATALAB_MARKER_FORMAT_LINES, DATALAB_MARKER_OUTPUT_FORMAT, + MINERU_API_MODE, + MINERU_API_URL, + MINERU_API_KEY, + MINERU_ENABLE_OCR, + MINERU_ENABLE_FORMULA, + MINERU_ENABLE_TABLE, + MINERU_LANGUAGE, + MINERU_MODEL_VERSION, + MINERU_PAGE_RANGES, DATALAB_MARKER_USE_LLM, EXTERNAL_DOCUMENT_LOADER_URL, EXTERNAL_DOCUMENT_LOADER_API_KEY, @@ -853,6 +862,15 @@ app.state.config.DOCLING_PICTURE_DESCRIPTION_API = DOCLING_PICTURE_DESCRIPTION_A app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY app.state.config.MISTRAL_OCR_API_KEY = MISTRAL_OCR_API_KEY +app.state.config.MINERU_API_MODE = MINERU_API_MODE +app.state.config.MINERU_API_URL = MINERU_API_URL +app.state.config.MINERU_API_KEY = MINERU_API_KEY +app.state.config.MINERU_ENABLE_OCR = MINERU_ENABLE_OCR +app.state.config.MINERU_ENABLE_FORMULA = MINERU_ENABLE_FORMULA +app.state.config.MINERU_ENABLE_TABLE = MINERU_ENABLE_TABLE +app.state.config.MINERU_LANGUAGE = MINERU_LANGUAGE +app.state.config.MINERU_MODEL_VERSION = MINERU_MODEL_VERSION +app.state.config.MINERU_PAGE_RANGES = MINERU_PAGE_RANGES app.state.config.TEXT_SPLITTER = RAG_TEXT_SPLITTER app.state.config.TIKTOKEN_ENCODING_NAME = TIKTOKEN_ENCODING_NAME diff --git a/backend/open_webui/retrieval/loaders/main.py b/backend/open_webui/retrieval/loaders/main.py index b3d90cc8f..cb41cea84 100644 --- a/backend/open_webui/retrieval/loaders/main.py +++ b/backend/open_webui/retrieval/loaders/main.py @@ -27,6 +27,7 @@ from open_webui.retrieval.loaders.external_document import ExternalDocumentLoade from open_webui.retrieval.loaders.mistral import MistralLoader from open_webui.retrieval.loaders.datalab_marker import DatalabMarkerLoader +from open_webui.retrieval.loaders.mineru import MinerULoader from open_webui.env import SRC_LOG_LEVELS, GLOBAL_LOG_LEVEL @@ -367,6 +368,27 @@ class Loader: api_endpoint=self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT"), azure_credential=DefaultAzureCredential(), ) + elif self.engine == "mineru" and file_ext in [ + "pdf", + "doc", + "docx", + "ppt", + "pptx", + "xls", + "xlsx", + ]: + loader = MinerULoader( + file_path=file_path, + api_mode=self.kwargs.get("MINERU_API_MODE", "local"), + api_url=self.kwargs.get("MINERU_API_URL", "http://localhost:8000"), + api_key=self.kwargs.get("MINERU_API_KEY", ""), + enable_ocr=self.kwargs.get("MINERU_ENABLE_OCR", False), + enable_formula=self.kwargs.get("MINERU_ENABLE_FORMULA", True), + enable_table=self.kwargs.get("MINERU_ENABLE_TABLE", True), + language=self.kwargs.get("MINERU_LANGUAGE", "en"), + model_version=self.kwargs.get("MINERU_MODEL_VERSION", "pipeline"), + page_ranges=self.kwargs.get("MINERU_PAGE_RANGES", ""), + ) elif ( self.engine == "mistral_ocr" and self.kwargs.get("MISTRAL_OCR_API_KEY") != "" diff --git a/backend/open_webui/retrieval/loaders/mineru.py b/backend/open_webui/retrieval/loaders/mineru.py new file mode 100644 index 000000000..0dbf12d87 --- /dev/null +++ b/backend/open_webui/retrieval/loaders/mineru.py @@ -0,0 +1,543 @@ +import os +import time +import requests +import logging +import tempfile +import zipfile +from typing import List, Optional +from langchain_core.documents import Document +from fastapi import HTTPException, status + +log = logging.getLogger(__name__) + + +class MinerULoader: + """ + MinerU document parser loader supporting both Cloud API and Local API modes. + + Cloud API: Uses MinerU managed service with async task-based processing + Local API: Uses self-hosted MinerU API with synchronous processing + """ + + def __init__( + self, + file_path: str, + api_mode: str = "local", + api_url: str = "http://localhost:8000", + api_key: str = "", + enable_ocr: bool = False, + enable_formula: bool = True, + enable_table: bool = True, + language: str = "en", + model_version: str = "pipeline", + page_ranges: str = "", + ): + self.file_path = file_path + self.api_mode = api_mode.lower() + self.api_url = api_url.rstrip("/") + self.api_key = api_key + self.enable_ocr = enable_ocr + self.enable_formula = enable_formula + self.enable_table = enable_table + self.language = language + self.model_version = model_version + self.page_ranges = page_ranges + + # Validate API mode + if self.api_mode not in ["local", "cloud"]: + raise ValueError( + f"Invalid API mode: {self.api_mode}. Must be 'local' or 'cloud'" + ) + + # Validate Cloud API requirements + if self.api_mode == "cloud" and not self.api_key: + raise ValueError("API key is required for Cloud API mode") + + def load(self) -> List[Document]: + """ + Main entry point for loading and parsing the document. + Routes to Cloud or Local API based on api_mode. + """ + try: + if self.api_mode == "cloud": + return self._load_cloud_api() + else: + return self._load_local_api() + except Exception as e: + log.error(f"Error loading document with MinerU: {e}") + raise + + def _load_local_api(self) -> List[Document]: + """ + Load document using Local API (synchronous). + Posts file to /file_parse endpoint and gets immediate response. + """ + log.info(f"Using MinerU Local API at {self.api_url}") + + filename = os.path.basename(self.file_path) + + # Build form data for Local API + form_data = { + "return_md": "true", + "formula_enable": str(self.enable_formula).lower(), + "table_enable": str(self.enable_table).lower(), + } + + # Parse method based on OCR setting + if self.enable_ocr: + form_data["parse_method"] = "ocr" + else: + form_data["parse_method"] = "auto" + + # Language configuration (Local API uses lang_list array) + if self.language: + form_data["lang_list"] = self.language + + # Backend/model version (Local API uses "backend" parameter) + if self.model_version == "vlm": + form_data["backend"] = "vlm-vllm-engine" + else: + form_data["backend"] = "pipeline" + + # Page ranges (Local API uses start_page_id and end_page_id) + if self.page_ranges: + # For simplicity, if page_ranges is specified, log a warning + # Full page range parsing would require parsing the string + log.warning( + f"Page ranges '{self.page_ranges}' specified but Local API uses different format. " + "Consider using start_page_id/end_page_id parameters if needed." + ) + + try: + with open(self.file_path, "rb") as f: + files = {"files": (filename, f, "application/octet-stream")} + + log.info(f"Sending file to MinerU Local API: {filename}") + log.debug(f"Local API parameters: {form_data}") + + response = requests.post( + f"{self.api_url}/file_parse", + data=form_data, + files=files, + timeout=300, # 5 minute timeout for large documents + ) + response.raise_for_status() + + except FileNotFoundError: + raise HTTPException( + status.HTTP_404_NOT_FOUND, detail=f"File not found: {self.file_path}" + ) + except requests.Timeout: + raise HTTPException( + status.HTTP_504_GATEWAY_TIMEOUT, + detail="MinerU Local API request timed out", + ) + except requests.HTTPError as e: + error_detail = f"MinerU Local API request failed: {e}" + if e.response is not None: + try: + error_data = e.response.json() + error_detail += f" - {error_data}" + except: + error_detail += f" - {e.response.text}" + raise HTTPException(status.HTTP_400_BAD_REQUEST, detail=error_detail) + except Exception as e: + raise HTTPException( + status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Error calling MinerU Local API: {str(e)}", + ) + + # Parse response + try: + result = response.json() + except ValueError as e: + raise HTTPException( + status.HTTP_502_BAD_GATEWAY, + detail=f"Invalid JSON response from MinerU Local API: {e}", + ) + + # Extract markdown content from response + if "results" not in result: + raise HTTPException( + status.HTTP_502_BAD_GATEWAY, + detail="MinerU Local API response missing 'results' field", + ) + + results = result["results"] + if not results: + raise HTTPException( + status.HTTP_400_BAD_REQUEST, + detail="MinerU returned empty results", + ) + + # Get the first (and typically only) result + file_result = list(results.values())[0] + markdown_content = file_result.get("md_content", "") + + if not markdown_content: + raise HTTPException( + status.HTTP_400_BAD_REQUEST, + detail="MinerU returned empty markdown content", + ) + + log.info(f"Successfully parsed document with MinerU Local API: {filename}") + + # Create metadata + metadata = { + "source": filename, + "api_mode": "local", + "backend": result.get("backend", "unknown"), + "version": result.get("version", "unknown"), + } + + return [Document(page_content=markdown_content, metadata=metadata)] + + def _load_cloud_api(self) -> List[Document]: + """ + Load document using Cloud API (asynchronous). + Uses batch upload endpoint to avoid need for public file URLs. + """ + log.info(f"Using MinerU Cloud API at {self.api_url}") + + filename = os.path.basename(self.file_path) + + # Step 1: Request presigned upload URL + batch_id, upload_url = self._request_upload_url(filename) + + # Step 2: Upload file to presigned URL + self._upload_to_presigned_url(upload_url) + + # Step 3: Poll for results + result = self._poll_batch_status(batch_id, filename) + + # Step 4: Download and extract markdown from ZIP + markdown_content = self._download_and_extract_zip( + result["full_zip_url"], filename + ) + + log.info(f"Successfully parsed document with MinerU Cloud API: {filename}") + + # Create metadata + metadata = { + "source": filename, + "api_mode": "cloud", + "batch_id": batch_id, + } + + return [Document(page_content=markdown_content, metadata=metadata)] + + def _request_upload_url(self, filename: str) -> tuple: + """ + Request presigned upload URL from Cloud API. + Returns (batch_id, upload_url). + """ + headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + } + + # Build request body + request_body = { + "enable_formula": self.enable_formula, + "enable_table": self.enable_table, + "language": self.language, + "model_version": self.model_version, + "files": [ + { + "name": filename, + "is_ocr": self.enable_ocr, + } + ], + } + + # Add page ranges if specified + if self.page_ranges: + request_body["files"][0]["page_ranges"] = self.page_ranges + + log.info(f"Requesting upload URL for: {filename}") + log.debug(f"Cloud API request body: {request_body}") + + try: + response = requests.post( + f"{self.api_url}/file-urls/batch", + headers=headers, + json=request_body, + timeout=30, + ) + response.raise_for_status() + except requests.HTTPError as e: + error_detail = f"Failed to request upload URL: {e}" + if e.response is not None: + try: + error_data = e.response.json() + error_detail += f" - {error_data.get('msg', error_data)}" + except: + error_detail += f" - {e.response.text}" + raise HTTPException(status.HTTP_400_BAD_REQUEST, detail=error_detail) + except Exception as e: + raise HTTPException( + status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Error requesting upload URL: {str(e)}", + ) + + try: + result = response.json() + except ValueError as e: + raise HTTPException( + status.HTTP_502_BAD_GATEWAY, + detail=f"Invalid JSON response: {e}", + ) + + # Check for API error response + if result.get("code") != 0: + raise HTTPException( + status.HTTP_400_BAD_REQUEST, + detail=f"MinerU Cloud API error: {result.get('msg', 'Unknown error')}", + ) + + data = result.get("data", {}) + batch_id = data.get("batch_id") + file_urls = data.get("file_urls", []) + + if not batch_id or not file_urls: + raise HTTPException( + status.HTTP_502_BAD_GATEWAY, + detail="MinerU Cloud API response missing batch_id or file_urls", + ) + + upload_url = file_urls[0] + log.info(f"Received upload URL for batch: {batch_id}") + + return batch_id, upload_url + + def _upload_to_presigned_url(self, upload_url: str) -> None: + """ + Upload file to presigned URL (no authentication needed). + """ + log.info(f"Uploading file to presigned URL") + + try: + with open(self.file_path, "rb") as f: + response = requests.put( + upload_url, + data=f, + timeout=300, # 5 minute timeout for large files + ) + response.raise_for_status() + except FileNotFoundError: + raise HTTPException( + status.HTTP_404_NOT_FOUND, detail=f"File not found: {self.file_path}" + ) + except requests.Timeout: + raise HTTPException( + status.HTTP_504_GATEWAY_TIMEOUT, + detail="File upload to presigned URL timed out", + ) + except requests.HTTPError as e: + raise HTTPException( + status.HTTP_400_BAD_REQUEST, + detail=f"Failed to upload file to presigned URL: {e}", + ) + except Exception as e: + raise HTTPException( + status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Error uploading file: {str(e)}", + ) + + log.info("File uploaded successfully") + + def _poll_batch_status(self, batch_id: str, filename: str) -> dict: + """ + Poll batch status until completion. + Returns the result dict for the file. + """ + headers = { + "Authorization": f"Bearer {self.api_key}", + } + + max_iterations = 300 # 10 minutes max (2 seconds per iteration) + poll_interval = 2 # seconds + + log.info(f"Polling batch status: {batch_id}") + + for iteration in range(max_iterations): + try: + response = requests.get( + f"{self.api_url}/extract-results/batch/{batch_id}", + headers=headers, + timeout=30, + ) + response.raise_for_status() + except requests.HTTPError as e: + error_detail = f"Failed to poll batch status: {e}" + if e.response is not None: + try: + error_data = e.response.json() + error_detail += f" - {error_data.get('msg', error_data)}" + except: + error_detail += f" - {e.response.text}" + raise HTTPException(status.HTTP_400_BAD_REQUEST, detail=error_detail) + except Exception as e: + raise HTTPException( + status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Error polling batch status: {str(e)}", + ) + + try: + result = response.json() + except ValueError as e: + raise HTTPException( + status.HTTP_502_BAD_GATEWAY, + detail=f"Invalid JSON response while polling: {e}", + ) + + # Check for API error response + if result.get("code") != 0: + raise HTTPException( + status.HTTP_400_BAD_REQUEST, + detail=f"MinerU Cloud API error: {result.get('msg', 'Unknown error')}", + ) + + data = result.get("data", {}) + extract_result = data.get("extract_result", []) + + # Find our file in the batch results + file_result = None + for item in extract_result: + if item.get("file_name") == filename: + file_result = item + break + + if not file_result: + raise HTTPException( + status.HTTP_502_BAD_GATEWAY, + detail=f"File {filename} not found in batch results", + ) + + state = file_result.get("state") + + if state == "done": + log.info(f"Processing complete for {filename}") + return file_result + elif state == "failed": + error_msg = file_result.get("err_msg", "Unknown error") + raise HTTPException( + status.HTTP_400_BAD_REQUEST, + detail=f"MinerU processing failed: {error_msg}", + ) + elif state in ["waiting-file", "pending", "running", "converting"]: + # Still processing + if iteration % 10 == 0: # Log every 20 seconds + log.info( + f"Processing status: {state} (iteration {iteration + 1}/{max_iterations})" + ) + time.sleep(poll_interval) + else: + log.warning(f"Unknown state: {state}") + time.sleep(poll_interval) + + # Timeout + raise HTTPException( + status.HTTP_504_GATEWAY_TIMEOUT, + detail="MinerU processing timed out after 10 minutes", + ) + + def _download_and_extract_zip(self, zip_url: str, filename: str) -> str: + """ + Download ZIP file from CDN and extract markdown content. + Returns the markdown content as a string. + """ + log.info(f"Downloading results from: {zip_url}") + + try: + response = requests.get(zip_url, timeout=60) + response.raise_for_status() + except requests.HTTPError as e: + raise HTTPException( + status.HTTP_400_BAD_REQUEST, + detail=f"Failed to download results ZIP: {e}", + ) + except Exception as e: + raise HTTPException( + status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Error downloading results: {str(e)}", + ) + + # Save ZIP to temporary file and extract + try: + with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp_zip: + tmp_zip.write(response.content) + tmp_zip_path = tmp_zip.name + + with tempfile.TemporaryDirectory() as tmp_dir: + # Extract ZIP + with zipfile.ZipFile(tmp_zip_path, "r") as zip_ref: + zip_ref.extractall(tmp_dir) + + # Find markdown file - search recursively for any .md file + markdown_content = None + found_md_path = None + + # First, list all files in the ZIP for debugging + all_files = [] + for root, dirs, files in os.walk(tmp_dir): + for file in files: + full_path = os.path.join(root, file) + all_files.append(full_path) + # Look for any .md file + if file.endswith(".md"): + found_md_path = full_path + log.info(f"Found markdown file at: {full_path}") + try: + with open(full_path, "r", encoding="utf-8") as f: + markdown_content = f.read() + if ( + markdown_content + ): # Use the first non-empty markdown file + break + except Exception as e: + log.warning(f"Failed to read {full_path}: {e}") + if markdown_content: + break + + if markdown_content is None: + log.error(f"Available files in ZIP: {all_files}") + # Try to provide more helpful error message + md_files = [f for f in all_files if f.endswith(".md")] + if md_files: + error_msg = ( + f"Found .md files but couldn't read them: {md_files}" + ) + else: + error_msg = ( + f"No .md files found in ZIP. Available files: {all_files}" + ) + raise HTTPException( + status.HTTP_502_BAD_GATEWAY, + detail=error_msg, + ) + + # Clean up temporary ZIP file + os.unlink(tmp_zip_path) + + except zipfile.BadZipFile as e: + raise HTTPException( + status.HTTP_502_BAD_GATEWAY, + detail=f"Invalid ZIP file received: {e}", + ) + except Exception as e: + raise HTTPException( + status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Error extracting ZIP: {str(e)}", + ) + + if not markdown_content: + raise HTTPException( + status.HTTP_400_BAD_REQUEST, + detail="Extracted markdown content is empty", + ) + + log.info( + f"Successfully extracted markdown content ({len(markdown_content)} characters)" + ) + return markdown_content diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py index c79d3ce65..6823c6294 100644 --- a/backend/open_webui/routers/retrieval.py +++ b/backend/open_webui/routers/retrieval.py @@ -466,6 +466,16 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)): "DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, "DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, "MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY, + # MinerU settings + "MINERU_API_MODE": request.app.state.config.MINERU_API_MODE, + "MINERU_API_URL": request.app.state.config.MINERU_API_URL, + "MINERU_API_KEY": request.app.state.config.MINERU_API_KEY, + "MINERU_ENABLE_OCR": request.app.state.config.MINERU_ENABLE_OCR, + "MINERU_ENABLE_FORMULA": request.app.state.config.MINERU_ENABLE_FORMULA, + "MINERU_ENABLE_TABLE": request.app.state.config.MINERU_ENABLE_TABLE, + "MINERU_LANGUAGE": request.app.state.config.MINERU_LANGUAGE, + "MINERU_MODEL_VERSION": request.app.state.config.MINERU_MODEL_VERSION, + "MINERU_PAGE_RANGES": request.app.state.config.MINERU_PAGE_RANGES, # Reranking settings "RAG_RERANKING_MODEL": request.app.state.config.RAG_RERANKING_MODEL, "RAG_RERANKING_ENGINE": request.app.state.config.RAG_RERANKING_ENGINE, @@ -647,6 +657,17 @@ class ConfigForm(BaseModel): DOCUMENT_INTELLIGENCE_KEY: Optional[str] = None MISTRAL_OCR_API_KEY: Optional[str] = None + # MinerU settings + MINERU_API_MODE: Optional[str] = None + MINERU_API_URL: Optional[str] = None + MINERU_API_KEY: Optional[str] = None + MINERU_ENABLE_OCR: Optional[bool] = None + MINERU_ENABLE_FORMULA: Optional[bool] = None + MINERU_ENABLE_TABLE: Optional[bool] = None + MINERU_LANGUAGE: Optional[str] = None + MINERU_MODEL_VERSION: Optional[str] = None + MINERU_PAGE_RANGES: Optional[str] = None + # Reranking settings RAG_RERANKING_MODEL: Optional[str] = None RAG_RERANKING_ENGINE: Optional[str] = None @@ -886,6 +907,53 @@ async def update_rag_config( else request.app.state.config.MISTRAL_OCR_API_KEY ) + # MinerU settings + request.app.state.config.MINERU_API_MODE = ( + form_data.MINERU_API_MODE + if form_data.MINERU_API_MODE is not None + else request.app.state.config.MINERU_API_MODE + ) + request.app.state.config.MINERU_API_URL = ( + form_data.MINERU_API_URL + if form_data.MINERU_API_URL is not None + else request.app.state.config.MINERU_API_URL + ) + request.app.state.config.MINERU_API_KEY = ( + form_data.MINERU_API_KEY + if form_data.MINERU_API_KEY is not None + else request.app.state.config.MINERU_API_KEY + ) + request.app.state.config.MINERU_ENABLE_OCR = ( + form_data.MINERU_ENABLE_OCR + if form_data.MINERU_ENABLE_OCR is not None + else request.app.state.config.MINERU_ENABLE_OCR + ) + request.app.state.config.MINERU_ENABLE_FORMULA = ( + form_data.MINERU_ENABLE_FORMULA + if form_data.MINERU_ENABLE_FORMULA is not None + else request.app.state.config.MINERU_ENABLE_FORMULA + ) + request.app.state.config.MINERU_ENABLE_TABLE = ( + form_data.MINERU_ENABLE_TABLE + if form_data.MINERU_ENABLE_TABLE is not None + else request.app.state.config.MINERU_ENABLE_TABLE + ) + request.app.state.config.MINERU_LANGUAGE = ( + form_data.MINERU_LANGUAGE + if form_data.MINERU_LANGUAGE is not None + else request.app.state.config.MINERU_LANGUAGE + ) + request.app.state.config.MINERU_MODEL_VERSION = ( + form_data.MINERU_MODEL_VERSION + if form_data.MINERU_MODEL_VERSION is not None + else request.app.state.config.MINERU_MODEL_VERSION + ) + request.app.state.config.MINERU_PAGE_RANGES = ( + form_data.MINERU_PAGE_RANGES + if form_data.MINERU_PAGE_RANGES is not None + else request.app.state.config.MINERU_PAGE_RANGES + ) + # Reranking settings if request.app.state.config.RAG_RERANKING_ENGINE == "": # Unloading the internal reranker and clear VRAM memory @@ -1150,6 +1218,16 @@ async def update_rag_config( "DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, "DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, "MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY, + # MinerU settings + "MINERU_API_MODE": request.app.state.config.MINERU_API_MODE, + "MINERU_API_URL": request.app.state.config.MINERU_API_URL, + "MINERU_API_KEY": request.app.state.config.MINERU_API_KEY, + "MINERU_ENABLE_OCR": request.app.state.config.MINERU_ENABLE_OCR, + "MINERU_ENABLE_FORMULA": request.app.state.config.MINERU_ENABLE_FORMULA, + "MINERU_ENABLE_TABLE": request.app.state.config.MINERU_ENABLE_TABLE, + "MINERU_LANGUAGE": request.app.state.config.MINERU_LANGUAGE, + "MINERU_MODEL_VERSION": request.app.state.config.MINERU_MODEL_VERSION, + "MINERU_PAGE_RANGES": request.app.state.config.MINERU_PAGE_RANGES, # Reranking settings "RAG_RERANKING_MODEL": request.app.state.config.RAG_RERANKING_MODEL, "RAG_RERANKING_ENGINE": request.app.state.config.RAG_RERANKING_ENGINE, @@ -1560,6 +1638,15 @@ def process_file( DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, MISTRAL_OCR_API_KEY=request.app.state.config.MISTRAL_OCR_API_KEY, + MINERU_API_MODE=request.app.state.config.MINERU_API_MODE, + MINERU_API_URL=request.app.state.config.MINERU_API_URL, + MINERU_API_KEY=request.app.state.config.MINERU_API_KEY, + MINERU_ENABLE_OCR=request.app.state.config.MINERU_ENABLE_OCR, + MINERU_ENABLE_FORMULA=request.app.state.config.MINERU_ENABLE_FORMULA, + MINERU_ENABLE_TABLE=request.app.state.config.MINERU_ENABLE_TABLE, + MINERU_LANGUAGE=request.app.state.config.MINERU_LANGUAGE, + MINERU_MODEL_VERSION=request.app.state.config.MINERU_MODEL_VERSION, + MINERU_PAGE_RANGES=request.app.state.config.MINERU_PAGE_RANGES, ) docs = loader.load( file.filename, file.meta.get("content_type"), file_path diff --git a/src/lib/components/admin/Settings/Documents.svelte b/src/lib/components/admin/Settings/Documents.svelte index 77b390374..7e338d9d6 100644 --- a/src/lib/components/admin/Settings/Documents.svelte +++ b/src/lib/components/admin/Settings/Documents.svelte @@ -207,6 +207,15 @@ return; } + if ( + RAGConfig.CONTENT_EXTRACTION_ENGINE === 'mineru' && + RAGConfig.MINERU_API_MODE === 'cloud' && + RAGConfig.MINERU_API_KEY === '' + ) { + toast.error($i18n.t('MinerU API Key required for Cloud API mode.')); + return; + } + if (!RAGConfig.BYPASS_EMBEDDING_AND_RETRIEVAL) { await embeddingModelUpdateHandler(); } @@ -337,6 +346,7 @@ + @@ -749,6 +759,137 @@ bind:value={RAGConfig.MISTRAL_OCR_API_KEY} /> + {:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'mineru'} + +
+
+
+ {$i18n.t('API Mode')} +
+ +
+
+ + +
+ +
+ + + {#if RAGConfig.MINERU_API_MODE === 'cloud'} +
+ +
+ {/if} + + +
+
+
+ {$i18n.t('Enable OCR (for scanned documents)')} +
+
+ +
+
+
+ + +
+
+
+ {$i18n.t('Enable Formula Recognition')} +
+
+ +
+
+
+ + +
+
+
+ {$i18n.t('Enable Table Recognition')} +
+
+ +
+
+
+ + +
+ + {$i18n.t('Advanced Settings')} + + +
+ +
+
+
+ {$i18n.t('Model Version')} +
+ +
+
+ + +
+ +
+ + +
+ +
+
+
{/if} From 288b323df8ecd41a77969516ef8107b27ef110df Mon Sep 17 00:00:00 2001 From: palazski Date: Wed, 15 Oct 2025 22:59:59 +0300 Subject: [PATCH 2/2] feat: use MINERU_PARAMS json field for mineru settings --- backend/open_webui/config.py | 43 ++---- backend/open_webui/main.py | 14 +- backend/open_webui/retrieval/loaders/main.py | 7 +- .../open_webui/retrieval/loaders/mineru.py | 22 ++- backend/open_webui/routers/retrieval.py | 61 +------- .../admin/Settings/Documents.svelte | 143 ++++++------------ 6 files changed, 77 insertions(+), 213 deletions(-) diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index 0d2191346..a40d3bf2e 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -2291,7 +2291,6 @@ DATALAB_MARKER_OUTPUT_FORMAT = PersistentConfig( os.environ.get("DATALAB_MARKER_OUTPUT_FORMAT", "markdown"), ) -# MinerU Configuration MINERU_API_MODE = PersistentConfig( "MINERU_API_MODE", "rag.mineru_api_mode", @@ -2310,40 +2309,16 @@ MINERU_API_KEY = PersistentConfig( os.environ.get("MINERU_API_KEY", ""), ) -MINERU_ENABLE_OCR = PersistentConfig( - "MINERU_ENABLE_OCR", - "rag.mineru_enable_ocr", - os.environ.get("MINERU_ENABLE_OCR", "false").lower() == "true", -) +mineru_params = os.getenv("MINERU_PARAMS", "") +try: + mineru_params = json.loads(mineru_params) +except json.JSONDecodeError: + mineru_params = {} -MINERU_ENABLE_FORMULA = PersistentConfig( - "MINERU_ENABLE_FORMULA", - "rag.mineru_enable_formula", - os.environ.get("MINERU_ENABLE_FORMULA", "true").lower() == "true", -) - -MINERU_ENABLE_TABLE = PersistentConfig( - "MINERU_ENABLE_TABLE", - "rag.mineru_enable_table", - os.environ.get("MINERU_ENABLE_TABLE", "true").lower() == "true", -) - -MINERU_LANGUAGE = PersistentConfig( - "MINERU_LANGUAGE", - "rag.mineru_language", - os.environ.get("MINERU_LANGUAGE", "en"), -) - -MINERU_MODEL_VERSION = PersistentConfig( - "MINERU_MODEL_VERSION", - "rag.mineru_model_version", - os.environ.get("MINERU_MODEL_VERSION", "pipeline"), # "pipeline" or "vlm" -) - -MINERU_PAGE_RANGES = PersistentConfig( - "MINERU_PAGE_RANGES", - "rag.mineru_page_ranges", - os.environ.get("MINERU_PAGE_RANGES", ""), +MINERU_PARAMS = PersistentConfig( + "MINERU_PARAMS", + "rag.mineru_params", + mineru_params, ) EXTERNAL_DOCUMENT_LOADER_URL = PersistentConfig( diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py index efb897d2b..9998af0e7 100644 --- a/backend/open_webui/main.py +++ b/backend/open_webui/main.py @@ -246,12 +246,7 @@ from open_webui.config import ( MINERU_API_MODE, MINERU_API_URL, MINERU_API_KEY, - MINERU_ENABLE_OCR, - MINERU_ENABLE_FORMULA, - MINERU_ENABLE_TABLE, - MINERU_LANGUAGE, - MINERU_MODEL_VERSION, - MINERU_PAGE_RANGES, + MINERU_PARAMS, DATALAB_MARKER_USE_LLM, EXTERNAL_DOCUMENT_LOADER_URL, EXTERNAL_DOCUMENT_LOADER_API_KEY, @@ -865,12 +860,7 @@ app.state.config.MISTRAL_OCR_API_KEY = MISTRAL_OCR_API_KEY app.state.config.MINERU_API_MODE = MINERU_API_MODE app.state.config.MINERU_API_URL = MINERU_API_URL app.state.config.MINERU_API_KEY = MINERU_API_KEY -app.state.config.MINERU_ENABLE_OCR = MINERU_ENABLE_OCR -app.state.config.MINERU_ENABLE_FORMULA = MINERU_ENABLE_FORMULA -app.state.config.MINERU_ENABLE_TABLE = MINERU_ENABLE_TABLE -app.state.config.MINERU_LANGUAGE = MINERU_LANGUAGE -app.state.config.MINERU_MODEL_VERSION = MINERU_MODEL_VERSION -app.state.config.MINERU_PAGE_RANGES = MINERU_PAGE_RANGES +app.state.config.MINERU_PARAMS = MINERU_PARAMS app.state.config.TEXT_SPLITTER = RAG_TEXT_SPLITTER app.state.config.TIKTOKEN_ENCODING_NAME = TIKTOKEN_ENCODING_NAME diff --git a/backend/open_webui/retrieval/loaders/main.py b/backend/open_webui/retrieval/loaders/main.py index cb41cea84..2ef1d75e0 100644 --- a/backend/open_webui/retrieval/loaders/main.py +++ b/backend/open_webui/retrieval/loaders/main.py @@ -382,12 +382,7 @@ class Loader: api_mode=self.kwargs.get("MINERU_API_MODE", "local"), api_url=self.kwargs.get("MINERU_API_URL", "http://localhost:8000"), api_key=self.kwargs.get("MINERU_API_KEY", ""), - enable_ocr=self.kwargs.get("MINERU_ENABLE_OCR", False), - enable_formula=self.kwargs.get("MINERU_ENABLE_FORMULA", True), - enable_table=self.kwargs.get("MINERU_ENABLE_TABLE", True), - language=self.kwargs.get("MINERU_LANGUAGE", "en"), - model_version=self.kwargs.get("MINERU_MODEL_VERSION", "pipeline"), - page_ranges=self.kwargs.get("MINERU_PAGE_RANGES", ""), + params=self.kwargs.get("MINERU_PARAMS", {}), ) elif ( self.engine == "mistral_ocr" diff --git a/backend/open_webui/retrieval/loaders/mineru.py b/backend/open_webui/retrieval/loaders/mineru.py index 0dbf12d87..437f44ae6 100644 --- a/backend/open_webui/retrieval/loaders/mineru.py +++ b/backend/open_webui/retrieval/loaders/mineru.py @@ -25,23 +25,21 @@ class MinerULoader: api_mode: str = "local", api_url: str = "http://localhost:8000", api_key: str = "", - enable_ocr: bool = False, - enable_formula: bool = True, - enable_table: bool = True, - language: str = "en", - model_version: str = "pipeline", - page_ranges: str = "", + params: dict = None, ): self.file_path = file_path self.api_mode = api_mode.lower() self.api_url = api_url.rstrip("/") self.api_key = api_key - self.enable_ocr = enable_ocr - self.enable_formula = enable_formula - self.enable_table = enable_table - self.language = language - self.model_version = model_version - self.page_ranges = page_ranges + + # Parse params dict with defaults + params = params or {} + self.enable_ocr = params.get("enable_ocr", False) + self.enable_formula = params.get("enable_formula", True) + self.enable_table = params.get("enable_table", True) + self.language = params.get("language", "en") + self.model_version = params.get("model_version", "pipeline") + self.page_ranges = params.get("page_ranges", "") # Validate API mode if self.api_mode not in ["local", "cloud"]: diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py index 6823c6294..cb66e8926 100644 --- a/backend/open_webui/routers/retrieval.py +++ b/backend/open_webui/routers/retrieval.py @@ -470,12 +470,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)): "MINERU_API_MODE": request.app.state.config.MINERU_API_MODE, "MINERU_API_URL": request.app.state.config.MINERU_API_URL, "MINERU_API_KEY": request.app.state.config.MINERU_API_KEY, - "MINERU_ENABLE_OCR": request.app.state.config.MINERU_ENABLE_OCR, - "MINERU_ENABLE_FORMULA": request.app.state.config.MINERU_ENABLE_FORMULA, - "MINERU_ENABLE_TABLE": request.app.state.config.MINERU_ENABLE_TABLE, - "MINERU_LANGUAGE": request.app.state.config.MINERU_LANGUAGE, - "MINERU_MODEL_VERSION": request.app.state.config.MINERU_MODEL_VERSION, - "MINERU_PAGE_RANGES": request.app.state.config.MINERU_PAGE_RANGES, + "MINERU_PARAMS": request.app.state.config.MINERU_PARAMS, # Reranking settings "RAG_RERANKING_MODEL": request.app.state.config.RAG_RERANKING_MODEL, "RAG_RERANKING_ENGINE": request.app.state.config.RAG_RERANKING_ENGINE, @@ -661,12 +656,7 @@ class ConfigForm(BaseModel): MINERU_API_MODE: Optional[str] = None MINERU_API_URL: Optional[str] = None MINERU_API_KEY: Optional[str] = None - MINERU_ENABLE_OCR: Optional[bool] = None - MINERU_ENABLE_FORMULA: Optional[bool] = None - MINERU_ENABLE_TABLE: Optional[bool] = None - MINERU_LANGUAGE: Optional[str] = None - MINERU_MODEL_VERSION: Optional[str] = None - MINERU_PAGE_RANGES: Optional[str] = None + MINERU_PARAMS: Optional[dict] = None # Reranking settings RAG_RERANKING_MODEL: Optional[str] = None @@ -923,35 +913,10 @@ async def update_rag_config( if form_data.MINERU_API_KEY is not None else request.app.state.config.MINERU_API_KEY ) - request.app.state.config.MINERU_ENABLE_OCR = ( - form_data.MINERU_ENABLE_OCR - if form_data.MINERU_ENABLE_OCR is not None - else request.app.state.config.MINERU_ENABLE_OCR - ) - request.app.state.config.MINERU_ENABLE_FORMULA = ( - form_data.MINERU_ENABLE_FORMULA - if form_data.MINERU_ENABLE_FORMULA is not None - else request.app.state.config.MINERU_ENABLE_FORMULA - ) - request.app.state.config.MINERU_ENABLE_TABLE = ( - form_data.MINERU_ENABLE_TABLE - if form_data.MINERU_ENABLE_TABLE is not None - else request.app.state.config.MINERU_ENABLE_TABLE - ) - request.app.state.config.MINERU_LANGUAGE = ( - form_data.MINERU_LANGUAGE - if form_data.MINERU_LANGUAGE is not None - else request.app.state.config.MINERU_LANGUAGE - ) - request.app.state.config.MINERU_MODEL_VERSION = ( - form_data.MINERU_MODEL_VERSION - if form_data.MINERU_MODEL_VERSION is not None - else request.app.state.config.MINERU_MODEL_VERSION - ) - request.app.state.config.MINERU_PAGE_RANGES = ( - form_data.MINERU_PAGE_RANGES - if form_data.MINERU_PAGE_RANGES is not None - else request.app.state.config.MINERU_PAGE_RANGES + request.app.state.config.MINERU_PARAMS = ( + form_data.MINERU_PARAMS + if form_data.MINERU_PARAMS is not None + else request.app.state.config.MINERU_PARAMS ) # Reranking settings @@ -1222,12 +1187,7 @@ async def update_rag_config( "MINERU_API_MODE": request.app.state.config.MINERU_API_MODE, "MINERU_API_URL": request.app.state.config.MINERU_API_URL, "MINERU_API_KEY": request.app.state.config.MINERU_API_KEY, - "MINERU_ENABLE_OCR": request.app.state.config.MINERU_ENABLE_OCR, - "MINERU_ENABLE_FORMULA": request.app.state.config.MINERU_ENABLE_FORMULA, - "MINERU_ENABLE_TABLE": request.app.state.config.MINERU_ENABLE_TABLE, - "MINERU_LANGUAGE": request.app.state.config.MINERU_LANGUAGE, - "MINERU_MODEL_VERSION": request.app.state.config.MINERU_MODEL_VERSION, - "MINERU_PAGE_RANGES": request.app.state.config.MINERU_PAGE_RANGES, + "MINERU_PARAMS": request.app.state.config.MINERU_PARAMS, # Reranking settings "RAG_RERANKING_MODEL": request.app.state.config.RAG_RERANKING_MODEL, "RAG_RERANKING_ENGINE": request.app.state.config.RAG_RERANKING_ENGINE, @@ -1641,12 +1601,7 @@ def process_file( MINERU_API_MODE=request.app.state.config.MINERU_API_MODE, MINERU_API_URL=request.app.state.config.MINERU_API_URL, MINERU_API_KEY=request.app.state.config.MINERU_API_KEY, - MINERU_ENABLE_OCR=request.app.state.config.MINERU_ENABLE_OCR, - MINERU_ENABLE_FORMULA=request.app.state.config.MINERU_ENABLE_FORMULA, - MINERU_ENABLE_TABLE=request.app.state.config.MINERU_ENABLE_TABLE, - MINERU_LANGUAGE=request.app.state.config.MINERU_LANGUAGE, - MINERU_MODEL_VERSION=request.app.state.config.MINERU_MODEL_VERSION, - MINERU_PAGE_RANGES=request.app.state.config.MINERU_PAGE_RANGES, + MINERU_PARAMS=request.app.state.config.MINERU_PARAMS, ) docs = loader.load( file.filename, file.meta.get("content_type"), file_path diff --git a/src/lib/components/admin/Settings/Documents.svelte b/src/lib/components/admin/Settings/Documents.svelte index 7e338d9d6..fbda44c4b 100644 --- a/src/lib/components/admin/Settings/Documents.svelte +++ b/src/lib/components/admin/Settings/Documents.svelte @@ -791,106 +791,57 @@ - + +
+ +
+ + + {#if RAGConfig.MINERU_API_MODE === 'cloud'}
-
- - - {#if RAGConfig.MINERU_API_MODE === 'cloud'} -
- -
- {/if} - - -
-
-
- {$i18n.t('Enable OCR (for scanned documents)')} -
-
- -
-
-
- - -
-
-
- {$i18n.t('Enable Formula Recognition')} -
-
- -
-
-
- - -
-
-
- {$i18n.t('Enable Table Recognition')} -
-
- -
-
-
- - -
- - {$i18n.t('Advanced Settings')} - - -
- -
-
-
- {$i18n.t('Model Version')} -
- -
-
- - -
- -
- - -
- -
-
-
{/if} + + +
+
+ + {$i18n.t('Parameters')} + +
+
+