feat: Add support for DOCX and HTML file formats using docling

- Rename PDFConverter to DocumentConverter with multi-format support
- Add SUPPORTED_FORMATS mapping for PDF, DOCX, HTML, HTM extensions
- Update indexing pipeline to use DocumentConverter
- Update file validation across all frontend components and scripts
- Preserve existing PDF OCR detection logic
- Add format-specific conversion methods for different document types

Co-Authored-By: PromptEngineer <jnfarooq@outlook.com>
This commit is contained in:
Devin AI 2025-07-21 20:40:39 +00:00
parent 6f69e61473
commit d5929ce29b
11 changed files with 149 additions and 103 deletions

View File

@ -101,7 +101,7 @@ class IndexCreator:
elif choice == "2":
dir_path = self.get_user_input("Enter directory path")
if os.path.isdir(dir_path):
supported_extensions = ['.pdf', '.txt', '.docx', '.md']
supported_extensions = ['.pdf', '.txt', '.docx', '.md', '.html', '.htm']
found_docs = []
for ext in supported_extensions:
@ -369,4 +369,4 @@ def main():
if __name__ == "__main__":
main()
main()

View File

@ -113,7 +113,7 @@ class BatchIndexingDemo:
if os.path.exists(doc_path):
# Check file extension
ext = Path(doc_path).suffix.lower()
if ext in ['.pdf', '.txt', '.docx', '.md']:
if ext in ['.pdf', '.txt', '.docx', '.md', '.html', '.htm']:
valid_documents.append(doc_path)
print(f"{doc_path}")
else:
@ -383,4 +383,4 @@ comprehensive processing pipelines.
if __name__ == "__main__":
main()
main()

View File

@ -0,0 +1,114 @@
from typing import List, Tuple, Dict, Any
from docling.document_converter import DocumentConverter as DoclingConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, OcrMacOptions
from docling.datamodel.base_models import InputFormat
import fitz # PyMuPDF for quick text inspection
import os
class DocumentConverter:
    """
    Convert documents to structured Markdown using the docling library.

    The file extension selects the docling ``InputFormat`` through
    ``SUPPORTED_FORMATS``. PDFs are first probed for an existing text layer so
    that OCR is only used when no extractable text is found; every other
    supported format goes through a default docling converter.
    """

    # Mapping of lower-cased file extensions to the docling InputFormat.
    SUPPORTED_FORMATS = {
        '.pdf': InputFormat.PDF,
        '.docx': InputFormat.DOCX,
        '.html': InputFormat.HTML,
        '.htm': InputFormat.HTML,
    }

    def __init__(self):
        """
        Build the three docling converters used by this class.

        * ``converter_no_ocr`` - PDF pipeline with OCR disabled (fast path).
        * ``converter_ocr`` - PDF pipeline with forced full-page OCR via
          ``OcrMacOptions`` (fallback).
        * ``converter_general`` - default docling converter for non-PDF input.

        If anything fails during construction, all three attributes are set to
        ``None`` and ``convert_to_markdown`` will return an empty list.
        """
        try:
            # --- Converter WITHOUT OCR (fast path) ---
            pipeline_no_ocr = PdfPipelineOptions()
            pipeline_no_ocr.do_ocr = False
            format_no_ocr = {
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_no_ocr)
            }
            self.converter_no_ocr = DoclingConverter(format_options=format_no_ocr)

            # --- Converter WITH OCR (fallback) ---
            pipeline_ocr = PdfPipelineOptions()
            pipeline_ocr.do_ocr = True
            pipeline_ocr.ocr_options = OcrMacOptions(force_full_page_ocr=True)
            format_ocr = {
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_ocr)
            }
            self.converter_ocr = DoclingConverter(format_options=format_ocr)

            # --- Converter for every non-PDF format ---
            self.converter_general = DoclingConverter()
            print("docling DocumentConverter(s) initialized (OCR + no-OCR + general).")
        except Exception as e:
            print(f"Error initializing docling DocumentConverter(s): {e}")
            self.converter_no_ocr = None
            self.converter_ocr = None
            self.converter_general = None

    def convert_to_markdown(self, file_path: str) -> List[Tuple[str, Dict[str, Any], Any]]:
        """
        Convert a document to a single Markdown string.

        Args:
            file_path: Path to the document; its extension (case-insensitive)
                must be a key of ``SUPPORTED_FORMATS``.

        Returns:
            A list holding one ``(markdown, metadata, document)`` tuple on
            success, or an empty list when the converters are unavailable, the
            extension is unsupported, or the conversion fails.
        """
        converters = (self.converter_no_ocr, self.converter_ocr, self.converter_general)
        if any(converter is None for converter in converters):
            print("docling converters not available. Skipping conversion.")
            return []

        file_ext = os.path.splitext(file_path)[1].lower()
        input_format = self.SUPPORTED_FORMATS.get(file_ext)
        if input_format is None:
            print(f"Unsupported file format: {file_ext}")
            return []

        if input_format == InputFormat.PDF:
            return self._convert_pdf_to_markdown(file_path)
        return self._convert_general_to_markdown(file_path, input_format)

    @staticmethod
    def _pdf_has_text(path: str) -> bool:
        """
        Return True if any page of the PDF at *path* has extractable text.

        Best-effort: if the file cannot be inspected the problem is reported
        and False is returned, which makes the caller fall back to OCR.
        """
        try:
            # The context manager closes the document (and its file handle)
            # even when we return early or a page raises.
            with fitz.open(path) as doc:
                return any(page.get_text("text").strip() for page in doc)
        except Exception as e:
            print(f"Could not inspect {path} for a text layer ({e}); assuming none.")
            return False

    def _convert_pdf_to_markdown(self, pdf_path: str) -> List[Tuple[str, Dict[str, Any], Any]]:
        """Convert a PDF, enabling OCR only when no text layer is detected."""
        # Quick heuristic: if the PDF already contains a text layer, skip OCR for speed
        use_ocr = not self._pdf_has_text(pdf_path)
        converter = self.converter_ocr if use_ocr else self.converter_no_ocr
        ocr_msg = "(OCR enabled)" if use_ocr else "(no OCR)"
        print(f"Converting {pdf_path} to Markdown using docling {ocr_msg}...")
        return self._perform_conversion(pdf_path, converter, ocr_msg)

    def _convert_general_to_markdown(self, file_path: str, input_format: InputFormat) -> List[Tuple[str, Dict[str, Any], Any]]:
        """Convert a non-PDF document using the general converter."""
        print(f"Converting {file_path} ({input_format.name}) to Markdown using docling...")
        return self._perform_conversion(file_path, self.converter_general, f"({input_format.name})")

    def _perform_conversion(self, file_path: str, converter, format_msg: str) -> List[Tuple[str, Dict[str, Any], Any]]:
        """
        Run *converter* on *file_path* and package the result.

        Args:
            file_path: Path of the document to convert.
            converter: The docling converter instance to use.
            format_msg: Short label appended to the success log line.

        Returns:
            ``[(markdown, {"source": file_path}, document)]`` on success, or
            an empty list if docling raises.
        """
        try:
            result = converter.convert(file_path)
            markdown_content = result.document.export_to_markdown()
            metadata = {"source": file_path}
        except Exception as e:
            print(f"Error processing {file_path} with docling: {e}")
            return []

        print(f"Successfully converted {file_path} with docling {format_msg}.")
        # Return the *DoclingDocument* object as third tuple element so downstream
        # chunkers that understand the element tree can use it. Legacy callers that
        # expect only (markdown, metadata) can simply ignore the extra value.
        return [(markdown_content, metadata, result.document)]

View File

@ -1,76 +0,0 @@
from typing import List, Tuple, Dict, Any
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, OcrMacOptions
from docling.datamodel.base_models import InputFormat
import fitz # PyMuPDF for quick text inspection
class PDFConverter:
    """
    Convert PDF files to structured Markdown using the docling library.

    Two docling converters are kept: one with OCR disabled (used when the PDF
    already has a text layer) and one with forced full-page OCR (used
    otherwise).
    """

    def __init__(self):
        """
        Build the OCR and no-OCR docling converters.

        If construction fails, both attributes are set to ``None`` and
        ``convert_to_markdown`` will return an empty list.
        """
        try:
            # --- Converter WITHOUT OCR (fast path) ---
            pipeline_no_ocr = PdfPipelineOptions()
            pipeline_no_ocr.do_ocr = False
            format_no_ocr = {
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_no_ocr)
            }
            self.converter_no_ocr = DocumentConverter(format_options=format_no_ocr)

            # --- Converter WITH OCR (fallback) ---
            pipeline_ocr = PdfPipelineOptions()
            pipeline_ocr.do_ocr = True
            pipeline_ocr.ocr_options = OcrMacOptions(force_full_page_ocr=True)
            format_ocr = {
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_ocr)
            }
            self.converter_ocr = DocumentConverter(format_options=format_ocr)
            print("docling DocumentConverter(s) initialized (OCR + no-OCR).")
        except Exception as e:
            print(f"Error initializing docling DocumentConverter(s): {e}")
            self.converter_no_ocr = None
            self.converter_ocr = None

    @staticmethod
    def _pdf_has_text(path: str) -> bool:
        """
        Return True if any page of the PDF at *path* has extractable text.

        Best-effort: if the file cannot be inspected the problem is reported
        and False is returned, which makes the caller fall back to OCR.
        """
        try:
            # The context manager closes the document (and its file handle)
            # even when we return early or a page raises.
            with fitz.open(path) as doc:
                return any(page.get_text("text").strip() for page in doc)
        except Exception as e:
            print(f"Could not inspect {path} for a text layer ({e}); assuming none.")
            return False

    def convert_to_markdown(self, pdf_path: str) -> List[Tuple[str, Dict[str, Any], Any]]:
        """
        Convert a PDF to a single Markdown string.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            A list holding one ``(markdown, metadata, document)`` tuple on
            success, or an empty list when the converters are unavailable or
            the conversion fails.
        """
        if self.converter_no_ocr is None or self.converter_ocr is None:
            print("docling converters not available. Skipping conversion.")
            return []

        # Quick heuristic: if the PDF already contains a text layer, skip OCR for speed
        use_ocr = not self._pdf_has_text(pdf_path)
        converter = self.converter_ocr if use_ocr else self.converter_no_ocr
        ocr_msg = "(OCR enabled)" if use_ocr else "(no OCR)"
        print(f"Converting {pdf_path} to Markdown using docling {ocr_msg}...")

        try:
            result = converter.convert(pdf_path)
            markdown_content = result.document.export_to_markdown()
            metadata = {"source": pdf_path}
        except Exception as e:
            print(f"Error processing PDF {pdf_path} with docling: {e}")
            return []

        print(f"Successfully converted {pdf_path} with docling {ocr_msg}.")
        # Return the *DoclingDocument* object as third tuple element so downstream
        # chunkers that understand the element tree can use it. Legacy callers that
        # expect only (markdown, metadata) can simply ignore the extra value.
        return [(markdown_content, metadata, result.document)]

View File

@ -1,7 +1,7 @@
from typing import List, Dict, Any
import os
import networkx as nx
from rag_system.ingestion.pdf_converter import PDFConverter
from rag_system.ingestion.document_converter import DocumentConverter
from rag_system.ingestion.chunking import MarkdownRecursiveChunker
from rag_system.indexing.representations import EmbeddingGenerator, select_embedder
from rag_system.indexing.embedders import LanceDBManager, VectorIndexer
@ -15,7 +15,7 @@ class IndexingPipeline:
self.config = config
self.llm_client = ollama_client
self.ollama_config = ollama_config
self.pdf_converter = PDFConverter()
self.document_converter = DocumentConverter()
# Chunker selection: docling (token-based) or legacy (character-based)
chunker_mode = config.get("chunker_mode", "docling")
@ -157,7 +157,7 @@ class IndexingPipeline:
document_id = os.path.basename(file_path)
print(f"Processing: {document_id}")
pages_data = self.pdf_converter.convert_to_markdown(file_path)
pages_data = self.document_converter.convert_to_markdown(file_path)
file_chunks = []
for tpl in pages_data:

View File

@ -71,7 +71,7 @@ validate_documents() {
if [ -f "$doc" ]; then
# Check file extension
case "${doc##*.}" in
pdf|txt|docx|md)
pdf|txt|docx|md|html|htm)
valid_docs+=("$doc")
print_status "✓ Valid document: $doc"
;;
@ -188,7 +188,7 @@ show_usage() {
echo " $0 \"Research Papers\" \"paper1.pdf\" \"paper2.pdf\" \"notes.txt\""
echo " $0 \"Invoice Collection\" ./invoices/*.pdf"
echo ""
echo "Supported file types: PDF, TXT, DOCX, MD"
echo "Supported file types: PDF, TXT, DOCX, MD, HTML"
}
# Main script
@ -225,4 +225,4 @@ main() {
}
# Run main function with all arguments
main "$@"
main "$@"

View File

@ -98,8 +98,8 @@ export function IndexForm({ onClose, onIndexed }: Props) {
onDrop={(e)=>{e.preventDefault(); if(e.dataTransfer.files) setFiles(e.dataTransfer.files)}}
>
<svg width="32" height="32" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round" className="mb-2 text-white/80"><path d="M4 16v2a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2v-2"/><polyline points="7 10 12 5 17 10"/><line x1="12" y1="5" x2="12" y2="16"/></svg>
<span className="text-xs text-gray-400">Drag & Drop PDFs here or click to browse</span>
<input id="file-upload" type="file" accept="application/pdf" multiple className="hidden" onChange={(e)=>setFiles(e.target.files)} />
<span className="text-xs text-gray-400">Drag & Drop documents here or click to browse</span>
<input id="file-upload" type="file" accept="application/pdf,.docx,.html,.htm" multiple className="hidden" onChange={(e)=>setFiles(e.target.files)} />
</label>
{files && <p className="mt-1 text-xs text-green-400">{files.length} file(s) selected</p>}
</div>
@ -220,4 +220,4 @@ export function IndexForm({ onClose, onIndexed }: Props) {
</div>
</div>
);
}
}

View File

@ -24,8 +24,8 @@ export function IndexWizard({ onClose }: Props) {
<div className="space-y-4">
<div>
<label className="block text-sm mb-1">PDF files</label>
<input type="file" accept="application/pdf" multiple onChange={handleFile} className="text-sm" />
<label className="block text-sm mb-1">Document files</label>
<input type="file" accept="application/pdf,.docx,.html,.htm" multiple onChange={handleFile} className="text-sm" />
</div>
<div className="grid grid-cols-2 gap-4">
@ -69,4 +69,4 @@ export function IndexWizard({ onClose }: Props) {
</div>
</div>
);
}
}

View File

@ -89,8 +89,12 @@ export function ChatInput({
lastModified: file.lastModified
});
// Only allow PDF files for now
if (file.type === 'application/pdf') {
if (file.type === 'application/pdf' ||
file.type === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
file.type === 'text/html' ||
file.name.toLowerCase().endsWith('.html') ||
file.name.toLowerCase().endsWith('.htm') ||
file.name.toLowerCase().endsWith('.docx')) {
newFiles.push({
id: crypto.randomUUID(),
name: file.name,
@ -99,7 +103,7 @@ export function ChatInput({
file: file,
})
} else {
console.log('🔧 Frontend: File rejected - not PDF:', file.type);
console.log('🔧 Frontend: File rejected - unsupported format:', file.type);
}
}
@ -153,7 +157,7 @@ export function ChatInput({
<div className="bg-white/5 backdrop-blur border border-white/10 rounded-2xl px-5 pt-4 pb-3 space-y-2">
{/* Hidden file input (kept for future use) */}
<input ref={fileInputRef} type="file" accept=".pdf" multiple onChange={handleFileChange} className="hidden" />
<input ref={fileInputRef} type="file" accept=".pdf,.docx,.html,.htm" multiple onChange={handleFileChange} className="hidden" />
{/* Textarea */}
<textarea
@ -200,4 +204,4 @@ export function ChatInput({
</form>
</div>
)
}
}

View File

@ -115,8 +115,12 @@ export function EmptyChatState({
const newFiles: AttachedFile[] = [];
for (let i = 0; i < files.length; i++) {
const file = files[i];
// Only allow PDF files for now
if (file.type === 'application/pdf') {
if (file.type === 'application/pdf' ||
file.type === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
file.type === 'text/html' ||
file.name.toLowerCase().endsWith('.html') ||
file.name.toLowerCase().endsWith('.htm') ||
file.name.toLowerCase().endsWith('.docx')) {
newFiles.push({
id: crypto.randomUUID(),
name: file.name,
@ -220,7 +224,7 @@ export function EmptyChatState({
<input
ref={fileInputRef}
type="file"
accept=".pdf"
accept=".pdf,.docx,.html,.htm"
multiple
onChange={handleFileChange}
className="hidden"
@ -278,4 +282,4 @@ export function EmptyChatState({
</div>
</div>
);
}
}

View File

@ -6,7 +6,7 @@
<body>
<h1>Test PDF Upload</h1>
<form id="uploadForm">
<input type="file" id="fileInput" accept=".pdf" />
<input type="file" id="fileInput" accept=".pdf,.docx,.html,.htm" />
<button type="submit">Upload PDF</button>
</form>
@ -51,4 +51,4 @@
});
</script>
</body>
</html>
</html>