mirror of
https://github.com/zebrajr/localGPT.git
synced 2025-12-06 00:20:19 +01:00
feat: Add support for DOCX and HTML file formats using docling
- Rename PDFConverter to DocumentConverter with multi-format support
- Add SUPPORTED_FORMATS mapping for PDF, DOCX, HTML, HTM extensions
- Update indexing pipeline to use DocumentConverter
- Update file validation across all frontend components and scripts
- Preserve existing PDF OCR detection logic
- Add format-specific conversion methods for different document types

Co-Authored-By: PromptEngineer <jnfarooq@outlook.com>
This commit is contained in:
parent
6f69e61473
commit
d5929ce29b
|
|
@ -101,7 +101,7 @@ class IndexCreator:
|
|||
elif choice == "2":
|
||||
dir_path = self.get_user_input("Enter directory path")
|
||||
if os.path.isdir(dir_path):
|
||||
supported_extensions = ['.pdf', '.txt', '.docx', '.md']
|
||||
supported_extensions = ['.pdf', '.txt', '.docx', '.md', '.html', '.htm']
|
||||
found_docs = []
|
||||
|
||||
for ext in supported_extensions:
|
||||
|
|
@ -369,4 +369,4 @@ def main():
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
|
|
@ -113,7 +113,7 @@ class BatchIndexingDemo:
|
|||
if os.path.exists(doc_path):
|
||||
# Check file extension
|
||||
ext = Path(doc_path).suffix.lower()
|
||||
if ext in ['.pdf', '.txt', '.docx', '.md']:
|
||||
if ext in ['.pdf', '.txt', '.docx', '.md', '.html', '.htm']:
|
||||
valid_documents.append(doc_path)
|
||||
print(f" ✅ {doc_path}")
|
||||
else:
|
||||
|
|
@ -383,4 +383,4 @@ comprehensive processing pipelines.
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
114
rag_system/ingestion/document_converter.py
Normal file
114
rag_system/ingestion/document_converter.py
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
from typing import List, Tuple, Dict, Any
|
||||
from docling.document_converter import DocumentConverter as DoclingConverter, PdfFormatOption
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions, OcrMacOptions
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
import fitz # PyMuPDF for quick text inspection
|
||||
import os
|
||||
|
||||
class DocumentConverter:
    """
    Convert documents to structured Markdown using the docling library.

    The file extension selects the conversion path (see ``SUPPORTED_FORMATS``):
    PDFs are routed to an OCR or a no-OCR converter depending on whether a
    text layer is found; every other supported format goes through a default
    docling converter.
    """

    # Mapping of lower-cased file extensions to docling InputFormat
    SUPPORTED_FORMATS = {
        '.pdf': InputFormat.PDF,
        '.docx': InputFormat.DOCX,
        '.html': InputFormat.HTML,
        '.htm': InputFormat.HTML,
    }

    def __init__(self):
        """
        Build the three docling converters used by this class.

        - ``converter_no_ocr``: PDF pipeline with ``do_ocr = False`` (fast path).
        - ``converter_ocr``: PDF pipeline with ``do_ocr = True`` and
          ``OcrMacOptions(force_full_page_ocr=True)``.
        - ``converter_general``: a default docling converter for non-PDF input.

        If any construction step raises, the error is printed and all three
        attributes are set to ``None`` so ``convert_to_markdown`` can skip work.
        """
        try:
            # --- Converter WITHOUT OCR (fast path) ---
            pipeline_no_ocr = PdfPipelineOptions()
            pipeline_no_ocr.do_ocr = False
            format_no_ocr = {
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_no_ocr)
            }
            self.converter_no_ocr = DoclingConverter(format_options=format_no_ocr)

            # --- Converter WITH OCR (fallback) ---
            pipeline_ocr = PdfPipelineOptions()
            pipeline_ocr.do_ocr = True
            ocr_options = OcrMacOptions(force_full_page_ocr=True)
            pipeline_ocr.ocr_options = ocr_options
            format_ocr = {
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_ocr)
            }
            self.converter_ocr = DoclingConverter(format_options=format_ocr)

            # --- Converter for non-PDF formats (docling defaults) ---
            self.converter_general = DoclingConverter()

            print("docling DocumentConverter(s) initialized (OCR + no-OCR + general).")
        except Exception as e:
            print(f"Error initializing docling DocumentConverter(s): {e}")
            self.converter_no_ocr = None
            self.converter_ocr = None
            self.converter_general = None

    def convert_to_markdown(self, file_path: str) -> List[Tuple[str, Dict[str, Any], Any]]:
        """
        Convert a document to Markdown, dispatching on its file extension.

        Args:
            file_path: Path to the document. The extension is matched
                case-insensitively against ``SUPPORTED_FORMATS``.

        Returns:
            A list holding one ``(markdown, metadata, docling_document)`` tuple
            on success. An empty list is returned when the converters failed
            to initialize, the extension is unsupported, or conversion raised.
        """
        # Explicit None checks: do not depend on the truthiness of docling objects.
        converters = (self.converter_no_ocr, self.converter_ocr, self.converter_general)
        if any(conv is None for conv in converters):
            print("docling converters not available. Skipping conversion.")
            return []

        file_ext = os.path.splitext(file_path)[1].lower()
        if file_ext not in self.SUPPORTED_FORMATS:
            print(f"Unsupported file format: {file_ext}")
            return []

        input_format = self.SUPPORTED_FORMATS[file_ext]

        if input_format == InputFormat.PDF:
            return self._convert_pdf_to_markdown(file_path)
        return self._convert_general_to_markdown(file_path, input_format)

    @staticmethod
    def _pdf_has_text(path: str) -> bool:
        """
        Return True if any page of the PDF yields non-blank text via PyMuPDF.

        Best-effort: if the file cannot be opened or read, the failure is
        reported and False is returned, which makes the caller fall back to OCR.
        """
        try:
            # The context manager closes the document on every exit path;
            # the previous code left the handle open.
            with fitz.open(path) as doc:
                return any(page.get_text("text").strip() for page in doc)
        except Exception as e:
            print(f"Could not inspect {path} for a text layer ({e}); assuming none.")
            return False

    def _convert_pdf_to_markdown(self, pdf_path: str) -> List[Tuple[str, Dict[str, Any], Any]]:
        """
        Convert a PDF, enabling OCR only when no text layer is detected.

        Quick heuristic: if the PDF already contains a text layer, skip OCR
        for speed and use the no-OCR converter.
        """
        use_ocr = not self._pdf_has_text(pdf_path)
        converter = self.converter_ocr if use_ocr else self.converter_no_ocr
        ocr_msg = "(OCR enabled)" if use_ocr else "(no OCR)"

        print(f"Converting {pdf_path} to Markdown using docling {ocr_msg}...")
        return self._perform_conversion(pdf_path, converter, ocr_msg)

    def _convert_general_to_markdown(self, file_path: str, input_format: InputFormat) -> List[Tuple[str, Dict[str, Any], Any]]:
        """Convert a non-PDF document using the general (default) converter."""
        print(f"Converting {file_path} ({input_format.name}) to Markdown using docling...")
        return self._perform_conversion(file_path, self.converter_general, f"({input_format.name})")

    def _perform_conversion(self, file_path: str, converter, format_msg: str) -> List[Tuple[str, Dict[str, Any], Any]]:
        """
        Run ``converter`` on ``file_path`` and export the result to Markdown.

        Args:
            file_path: Path of the document to convert.
            converter: Object exposing ``convert(path)`` whose result has a
                ``document`` with ``export_to_markdown()``.
            format_msg: Short tag appended to the success log line.

        Returns:
            ``[(markdown, {"source": file_path}, document)]`` on success, or
            an empty list if any step raised (the error is printed).
        """
        try:
            result = converter.convert(file_path)
            markdown_content = result.document.export_to_markdown()
            metadata = {"source": file_path}
            # Return the *DoclingDocument* object as third tuple element so downstream
            # chunkers that understand the element tree can use it. Legacy callers that
            # expect only (markdown, metadata) can simply ignore the extra value.
            pages_data = [(markdown_content, metadata, result.document)]
            print(f"Successfully converted {file_path} with docling {format_msg}.")
            return pages_data
        except Exception as e:
            print(f"Error processing {file_path} with docling: {e}")
            return []
|
||||
|
|
@ -1,76 +0,0 @@
|
|||
from typing import List, Tuple, Dict, Any
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions, OcrMacOptions
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
import fitz # PyMuPDF for quick text inspection
|
||||
|
||||
class PDFConverter:
    """
    Convert PDF files to structured Markdown using the docling library.

    Two docling converters are kept: one without OCR for PDFs that already
    have a text layer, and one with forced full-page OCR as the fallback.
    """

    def __init__(self):
        """
        Build the OCR and no-OCR docling converters.

        - ``converter_no_ocr``: PDF pipeline with ``do_ocr = False`` (fast path).
        - ``converter_ocr``: PDF pipeline with ``do_ocr = True`` and
          ``OcrMacOptions(force_full_page_ocr=True)``.

        If construction raises, the error is printed and both attributes are
        set to ``None`` so ``convert_to_markdown`` can skip work.
        """
        try:
            # --- Converter WITHOUT OCR (fast path) ---
            pipeline_no_ocr = PdfPipelineOptions()
            pipeline_no_ocr.do_ocr = False
            format_no_ocr = {
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_no_ocr)
            }
            self.converter_no_ocr = DocumentConverter(format_options=format_no_ocr)

            # --- Converter WITH OCR (fallback) ---
            pipeline_ocr = PdfPipelineOptions()
            pipeline_ocr.do_ocr = True
            ocr_options = OcrMacOptions(force_full_page_ocr=True)
            pipeline_ocr.ocr_options = ocr_options
            format_ocr = {
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_ocr)
            }
            self.converter_ocr = DocumentConverter(format_options=format_ocr)

            print("docling DocumentConverter(s) initialized (OCR + no-OCR).")
        except Exception as e:
            print(f"Error initializing docling DocumentConverter(s): {e}")
            self.converter_no_ocr = None
            self.converter_ocr = None

    @staticmethod
    def _pdf_has_text(path: str) -> bool:
        """
        Return True if any page of the PDF yields non-blank text via PyMuPDF.

        Best-effort: if the file cannot be opened or read, the failure is
        reported and False is returned, which makes the caller fall back to OCR.
        """
        try:
            # The context manager closes the document on every exit path;
            # the previous code left the handle open.
            with fitz.open(path) as doc:
                return any(page.get_text("text").strip() for page in doc)
        except Exception as e:
            print(f"Could not inspect {path} for a text layer ({e}); assuming none.")
            return False

    def convert_to_markdown(self, pdf_path: str) -> List[Tuple[str, Dict[str, Any], Any]]:
        """
        Convert a PDF to Markdown, enabling OCR only when no text layer is found.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            A list holding one ``(markdown, metadata, docling_document)`` tuple
            on success. An empty list is returned when the converters failed
            to initialize or the conversion raised (the error is printed).
        """
        # Explicit None checks: do not depend on the truthiness of docling objects.
        if self.converter_no_ocr is None or self.converter_ocr is None:
            print("docling converters not available. Skipping conversion.")
            return []

        # Quick heuristic: if the PDF already contains a text layer, skip OCR for speed
        use_ocr = not self._pdf_has_text(pdf_path)
        converter = self.converter_ocr if use_ocr else self.converter_no_ocr
        ocr_msg = "(OCR enabled)" if use_ocr else "(no OCR)"

        print(f"Converting {pdf_path} to Markdown using docling {ocr_msg}...")
        try:
            result = converter.convert(pdf_path)
            markdown_content = result.document.export_to_markdown()
            metadata = {"source": pdf_path}
            # Return the *DoclingDocument* object as third tuple element so downstream
            # chunkers that understand the element tree can use it. Legacy callers that
            # expect only (markdown, metadata) can simply ignore the extra value.
            pages_data = [(markdown_content, metadata, result.document)]
            print(f"Successfully converted {pdf_path} with docling {ocr_msg}.")
            return pages_data
        except Exception as e:
            print(f"Error processing PDF {pdf_path} with docling: {e}")
            return []
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
from typing import List, Dict, Any
|
||||
import os
|
||||
import networkx as nx
|
||||
from rag_system.ingestion.pdf_converter import PDFConverter
|
||||
from rag_system.ingestion.document_converter import DocumentConverter
|
||||
from rag_system.ingestion.chunking import MarkdownRecursiveChunker
|
||||
from rag_system.indexing.representations import EmbeddingGenerator, select_embedder
|
||||
from rag_system.indexing.embedders import LanceDBManager, VectorIndexer
|
||||
|
|
@ -15,7 +15,7 @@ class IndexingPipeline:
|
|||
self.config = config
|
||||
self.llm_client = ollama_client
|
||||
self.ollama_config = ollama_config
|
||||
self.pdf_converter = PDFConverter()
|
||||
self.document_converter = DocumentConverter()
|
||||
# Chunker selection: docling (token-based) or legacy (character-based)
|
||||
chunker_mode = config.get("chunker_mode", "docling")
|
||||
|
||||
|
|
@ -157,7 +157,7 @@ class IndexingPipeline:
|
|||
document_id = os.path.basename(file_path)
|
||||
print(f"Processing: {document_id}")
|
||||
|
||||
pages_data = self.pdf_converter.convert_to_markdown(file_path)
|
||||
pages_data = self.document_converter.convert_to_markdown(file_path)
|
||||
file_chunks = []
|
||||
|
||||
for tpl in pages_data:
|
||||
|
|
|
|||
|
|
@ -71,7 +71,7 @@ validate_documents() {
|
|||
if [ -f "$doc" ]; then
|
||||
# Check file extension
|
||||
case "${doc##*.}" in
|
||||
pdf|txt|docx|md)
|
||||
pdf|txt|docx|md|html|htm)
|
||||
valid_docs+=("$doc")
|
||||
print_status "✓ Valid document: $doc"
|
||||
;;
|
||||
|
|
@ -188,7 +188,7 @@ show_usage() {
|
|||
echo " $0 \"Research Papers\" \"paper1.pdf\" \"paper2.pdf\" \"notes.txt\""
|
||||
echo " $0 \"Invoice Collection\" ./invoices/*.pdf"
|
||||
echo ""
|
||||
echo "Supported file types: PDF, TXT, DOCX, MD"
|
||||
echo "Supported file types: PDF, TXT, DOCX, MD, HTML"
|
||||
}
|
||||
|
||||
# Main script
|
||||
|
|
@ -225,4 +225,4 @@ main() {
|
|||
}
|
||||
|
||||
# Run main function with all arguments
|
||||
main "$@"
|
||||
main "$@"
|
||||
|
|
@ -98,8 +98,8 @@ export function IndexForm({ onClose, onIndexed }: Props) {
|
|||
onDrop={(e)=>{e.preventDefault(); if(e.dataTransfer.files) setFiles(e.dataTransfer.files)}}
|
||||
>
|
||||
<svg width="32" height="32" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round" className="mb-2 text-white/80"><path d="M4 16v2a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2v-2"/><polyline points="7 10 12 5 17 10"/><line x1="12" y1="5" x2="12" y2="16"/></svg>
|
||||
<span className="text-xs text-gray-400">Drag & Drop PDFs here or click to browse</span>
|
||||
<input id="file-upload" type="file" accept="application/pdf" multiple className="hidden" onChange={(e)=>setFiles(e.target.files)} />
|
||||
<span className="text-xs text-gray-400">Drag & Drop documents here or click to browse</span>
|
||||
<input id="file-upload" type="file" accept="application/pdf,.docx,.html,.htm" multiple className="hidden" onChange={(e)=>setFiles(e.target.files)} />
|
||||
</label>
|
||||
{files && <p className="mt-1 text-xs text-green-400">{files.length} file(s) selected</p>}
|
||||
</div>
|
||||
|
|
@ -220,4 +220,4 @@ export function IndexForm({ onClose, onIndexed }: Props) {
|
|||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
@ -24,8 +24,8 @@ export function IndexWizard({ onClose }: Props) {
|
|||
|
||||
<div className="space-y-4">
|
||||
<div>
|
||||
<label className="block text-sm mb-1">PDF files</label>
|
||||
<input type="file" accept="application/pdf" multiple onChange={handleFile} className="text-sm" />
|
||||
<label className="block text-sm mb-1">Document files</label>
|
||||
<input type="file" accept="application/pdf,.docx,.html,.htm" multiple onChange={handleFile} className="text-sm" />
|
||||
</div>
|
||||
|
||||
<div className="grid grid-cols-2 gap-4">
|
||||
|
|
@ -69,4 +69,4 @@ export function IndexWizard({ onClose }: Props) {
|
|||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
@ -89,8 +89,12 @@ export function ChatInput({
|
|||
lastModified: file.lastModified
|
||||
});
|
||||
|
||||
// Only allow PDF files for now
|
||||
if (file.type === 'application/pdf') {
|
||||
if (file.type === 'application/pdf' ||
|
||||
file.type === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
|
||||
file.type === 'text/html' ||
|
||||
file.name.toLowerCase().endsWith('.html') ||
|
||||
file.name.toLowerCase().endsWith('.htm') ||
|
||||
file.name.toLowerCase().endsWith('.docx')) {
|
||||
newFiles.push({
|
||||
id: crypto.randomUUID(),
|
||||
name: file.name,
|
||||
|
|
@ -99,7 +103,7 @@ export function ChatInput({
|
|||
file: file,
|
||||
})
|
||||
} else {
|
||||
console.log('🔧 Frontend: File rejected - not PDF:', file.type);
|
||||
console.log('🔧 Frontend: File rejected - unsupported format:', file.type);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -153,7 +157,7 @@ export function ChatInput({
|
|||
|
||||
<div className="bg-white/5 backdrop-blur border border-white/10 rounded-2xl px-5 pt-4 pb-3 space-y-2">
|
||||
{/* Hidden file input (kept for future use) */}
|
||||
<input ref={fileInputRef} type="file" accept=".pdf" multiple onChange={handleFileChange} className="hidden" />
|
||||
<input ref={fileInputRef} type="file" accept=".pdf,.docx,.html,.htm" multiple onChange={handleFileChange} className="hidden" />
|
||||
|
||||
{/* Textarea */}
|
||||
<textarea
|
||||
|
|
@ -200,4 +204,4 @@ export function ChatInput({
|
|||
</form>
|
||||
</div>
|
||||
)
|
||||
}
|
||||
}
|
||||
|
|
@ -115,8 +115,12 @@ export function EmptyChatState({
|
|||
const newFiles: AttachedFile[] = [];
|
||||
for (let i = 0; i < files.length; i++) {
|
||||
const file = files[i];
|
||||
// Only allow PDF files for now
|
||||
if (file.type === 'application/pdf') {
|
||||
if (file.type === 'application/pdf' ||
|
||||
file.type === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
|
||||
file.type === 'text/html' ||
|
||||
file.name.toLowerCase().endsWith('.html') ||
|
||||
file.name.toLowerCase().endsWith('.htm') ||
|
||||
file.name.toLowerCase().endsWith('.docx')) {
|
||||
newFiles.push({
|
||||
id: crypto.randomUUID(),
|
||||
name: file.name,
|
||||
|
|
@ -220,7 +224,7 @@ export function EmptyChatState({
|
|||
<input
|
||||
ref={fileInputRef}
|
||||
type="file"
|
||||
accept=".pdf"
|
||||
accept=".pdf,.docx,.html,.htm"
|
||||
multiple
|
||||
onChange={handleFileChange}
|
||||
className="hidden"
|
||||
|
|
@ -278,4 +282,4 @@ export function EmptyChatState({
|
|||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
@ -6,7 +6,7 @@
|
|||
<body>
|
||||
<h1>Test PDF Upload</h1>
|
||||
<form id="uploadForm">
|
||||
<input type="file" id="fileInput" accept=".pdf" />
|
||||
<input type="file" id="fileInput" accept=".pdf,.docx,.html,.htm" />
|
||||
<button type="submit">Upload PDF</button>
|
||||
</form>
|
||||
|
||||
|
|
@ -51,4 +51,4 @@
|
|||
});
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
</html>
|
||||
Loading…
Reference in New Issue
Block a user