feat: Add support for DOCX and HTML file formats using docling

- Rename PDFConverter to DocumentConverter with multi-format support
- Add SUPPORTED_FORMATS mapping for PDF, DOCX, HTML, HTM extensions
- Update indexing pipeline to use DocumentConverter
- Update file validation across all frontend components and scripts
- Preserve existing PDF OCR detection logic
- Add format-specific conversion methods for different document types

Co-Authored-By: PromptEngineer <jnfarooq@outlook.com>
2025-12-06 00:20:19 +01:00 · 2025-07-21 20:40:39 +00:00 · 2025-07-21 20:40:39 +00:00 · d5929ce29b
commit d5929ce29b
parent 6f69e61473
11 changed files with 149 additions and 103 deletions
--- a/create_index_script.py
+++ b/create_index_script.py
@ -101,7 +101,7 @@ class IndexCreator:
            elif choice == "2":
                dir_path = self.get_user_input("Enter directory path")
                if os.path.isdir(dir_path):
-                    supported_extensions = ['.pdf', '.txt', '.docx', '.md']
+                    supported_extensions = ['.pdf', '.txt', '.docx', '.md', '.html', '.htm']
                    found_docs = []
                    
                    for ext in supported_extensions:
@ -369,4 +369,4 @@ def main():


 if __name__ == "__main__":
-    main() 
+    main()  
--- a/demo_batch_indexing.py
+++ b/demo_batch_indexing.py
@ -113,7 +113,7 @@ class BatchIndexingDemo:
            if os.path.exists(doc_path):
                # Check file extension
                ext = Path(doc_path).suffix.lower()
-                if ext in ['.pdf', '.txt', '.docx', '.md']:
+                if ext in ['.pdf', '.txt', '.docx', '.md', '.html', '.htm']:
                    valid_documents.append(doc_path)
                    print(f"  ✅ {doc_path}")
                else:
@ -383,4 +383,4 @@ comprehensive processing pipelines.


 if __name__ == "__main__":
-    main() 
+    main()  
--- a/rag_system/ingestion/document_converter.py
+++ b/rag_system/ingestion/document_converter.py
@ -0,0 +1,114 @@
+from typing import List, Tuple, Dict, Any
+from docling.document_converter import DocumentConverter as DoclingConverter, PdfFormatOption
+from docling.datamodel.pipeline_options import PdfPipelineOptions, OcrMacOptions
+from docling.datamodel.base_models import InputFormat
+import fitz  # PyMuPDF for quick text inspection
+import os
+
+class DocumentConverter:
+    """
+    A class to convert various document formats to structured Markdown using the docling library.
+    Supports PDF, DOCX, and HTML (.html/.htm); any other extension is reported as unsupported.
+    """
+    
+    # Mapping of file extensions to InputFormat
+    SUPPORTED_FORMATS = {
+        '.pdf': InputFormat.PDF,
+        '.docx': InputFormat.DOCX,
+        '.html': InputFormat.HTML,
+        '.htm': InputFormat.HTML,
+    }
+    
+    def __init__(self):
+        """Initializes three docling converters: PDF without OCR, PDF with forced full-page OCR (OcrMacOptions), and a default converter for non-PDF formats."""
+        try:
+            # --- Converter WITHOUT OCR (fast path) ---
+            pipeline_no_ocr = PdfPipelineOptions()
+            pipeline_no_ocr.do_ocr = False
+            format_no_ocr = {
+                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_no_ocr)
+            }
+            self.converter_no_ocr = DoclingConverter(format_options=format_no_ocr)
+
+            # --- Converter WITH OCR (fallback) ---
+            pipeline_ocr = PdfPipelineOptions()
+            pipeline_ocr.do_ocr = True
+            ocr_options = OcrMacOptions(force_full_page_ocr=True)
+            pipeline_ocr.ocr_options = ocr_options
+            format_ocr = {
+                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_ocr)
+            }
+            self.converter_ocr = DoclingConverter(format_options=format_ocr)
+            
+            self.converter_general = DoclingConverter()
+
+            print("docling DocumentConverter(s) initialized (OCR + no-OCR + general).")
+        except Exception as e:
+            print(f"Error initializing docling DocumentConverter(s): {e}")
+            self.converter_no_ocr = None
+            self.converter_ocr = None
+            self.converter_general = None
+
+    def convert_to_markdown(self, file_path: str) -> List[Tuple[str, Dict[str, Any]]]:
+        """
+        Converts a document to a single Markdown string, preserving layout and tables.
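+        Supports PDF, DOCX, and HTML (.html/.htm). Returns a one-element list holding a (markdown, metadata, DoclingDocument) tuple, or [] if the converters are unavailable, the extension is unsupported, or conversion fails.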
+        """
+        if not (self.converter_no_ocr and self.converter_ocr and self.converter_general):
+            print("docling converters not available. Skipping conversion.")
+            return []
+        
+        file_ext = os.path.splitext(file_path)[1].lower()
+        if file_ext not in self.SUPPORTED_FORMATS:
+            print(f"Unsupported file format: {file_ext}")
+            return []
+        
+        input_format = self.SUPPORTED_FORMATS[file_ext]
+        
+        if input_format == InputFormat.PDF:
+            return self._convert_pdf_to_markdown(file_path)
+        else:
+            return self._convert_general_to_markdown(file_path, input_format)
+    
+    def _convert_pdf_to_markdown(self, pdf_path: str) -> List[Tuple[str, Dict[str, Any]]]:
+        """Convert PDF with OCR detection logic."""
+        # Quick heuristic: if the PDF already contains a text layer, skip OCR for speed
+        def _pdf_has_text(path: str) -> bool:
+            try:
+                doc = fitz.open(path)
+                for page in doc:
+                    if page.get_text("text").strip():
+                        return True
+            except Exception:
+                pass
+            return False
+
+        use_ocr = not _pdf_has_text(pdf_path)
+        converter = self.converter_ocr if use_ocr else self.converter_no_ocr
+        ocr_msg = "(OCR enabled)" if use_ocr else "(no OCR)"
+
+        print(f"Converting {pdf_path} to Markdown using docling {ocr_msg}...")
+        return self._perform_conversion(pdf_path, converter, ocr_msg)
+    
+    def _convert_general_to_markdown(self, file_path: str, input_format: InputFormat) -> List[Tuple[str, Dict[str, Any]]]:
+        """Convert non-PDF formats using general converter."""
+        print(f"Converting {file_path} ({input_format.name}) to Markdown using docling...")
+        return self._perform_conversion(file_path, self.converter_general, f"({input_format.name})")
+    
+    def _perform_conversion(self, file_path: str, converter, format_msg: str) -> List[Tuple[str, Dict[str, Any]]]:
+        """Perform the actual conversion using the specified converter."""
+        pages_data = []
+        try:
+            result = converter.convert(file_path)
+            markdown_content = result.document.export_to_markdown()
+            
+            metadata = {"source": file_path}
+            # Return the *DoclingDocument* object as third tuple element so downstream
+            # chunkers that understand the element tree can use it.  Legacy callers that
+            # expect only (markdown, metadata) can simply ignore the extra value.
+            pages_data.append((markdown_content, metadata, result.document))
+            print(f"Successfully converted {file_path} with docling {format_msg}.")
+            return pages_data
+        except Exception as e:
+            print(f"Error processing {file_path} with docling: {e}")
+            return []
--- a/rag_system/ingestion/pdf_converter.py
+++ b/rag_system/ingestion/pdf_converter.py
@ -1,76 +0,0 @@
-from typing import List, Tuple, Dict, Any
-from docling.document_converter import DocumentConverter, PdfFormatOption
-from docling.datamodel.pipeline_options import PdfPipelineOptions, OcrMacOptions
-from docling.datamodel.base_models import InputFormat
-import fitz  # PyMuPDF for quick text inspection
-
-class PDFConverter:
-    """
-    A class to convert PDF files to structured Markdown using the docling library.
-    """
-    def __init__(self):
-        """Initializes the docling document converter with forced OCR enabled for macOS."""
-        try:
-            # --- Converter WITHOUT OCR (fast path) ---
-            pipeline_no_ocr = PdfPipelineOptions()
-            pipeline_no_ocr.do_ocr = False
-            format_no_ocr = {
-                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_no_ocr)
-            }
-            self.converter_no_ocr = DocumentConverter(format_options=format_no_ocr)
-
-            # --- Converter WITH OCR (fallback) ---
-            pipeline_ocr = PdfPipelineOptions()
-            pipeline_ocr.do_ocr = True
-            ocr_options = OcrMacOptions(force_full_page_ocr=True)
-            pipeline_ocr.ocr_options = ocr_options
-            format_ocr = {
-                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_ocr)
-            }
-            self.converter_ocr = DocumentConverter(format_options=format_ocr)
-
-            print("docling DocumentConverter(s) initialized (OCR + no-OCR).")
-        except Exception as e:
-            print(f"Error initializing docling DocumentConverter(s): {e}")
-            self.converter_no_ocr = None
-            self.converter_ocr = None
-
-    def convert_to_markdown(self, pdf_path: str) -> List[Tuple[str, Dict[str, Any]]]:
-        """
-        Converts a PDF to a single Markdown string, preserving layout and tables.
-        """
-        if not (self.converter_no_ocr and self.converter_ocr):
-            print("docling converters not available. Skipping conversion.")
-            return []
-            
-        # Quick heuristic: if the PDF already contains a text layer, skip OCR for speed
-        def _pdf_has_text(path: str) -> bool:
-            try:
-                doc = fitz.open(path)
-                for page in doc:
-                    if page.get_text("text").strip():
-                        return True
-            except Exception:
-                pass
-            return False
-
-        use_ocr = not _pdf_has_text(pdf_path)
-        converter = self.converter_ocr if use_ocr else self.converter_no_ocr
-        ocr_msg = "(OCR enabled)" if use_ocr else "(no OCR)"
-
-        print(f"Converting {pdf_path} to Markdown using docling {ocr_msg}...")
-        pages_data = []
-        try:
-            result = converter.convert(pdf_path)
-            markdown_content = result.document.export_to_markdown()
-            
-            metadata = {"source": pdf_path}
-            # Return the *DoclingDocument* object as third tuple element so downstream
-            # chunkers that understand the element tree can use it.  Legacy callers that
-            # expect only (markdown, metadata) can simply ignore the extra value.
-            pages_data.append((markdown_content, metadata, result.document))
-            print(f"Successfully converted {pdf_path} with docling {ocr_msg}.")
-            return pages_data
-        except Exception as e:
-            print(f"Error processing PDF {pdf_path} with docling: {e}")
-            return []
--- a/rag_system/pipelines/indexing_pipeline.py
+++ b/rag_system/pipelines/indexing_pipeline.py
@ -1,7 +1,7 @@
 from typing import List, Dict, Any
 import os
 import networkx as nx
-from rag_system.ingestion.pdf_converter import PDFConverter
+from rag_system.ingestion.document_converter import DocumentConverter
 from rag_system.ingestion.chunking import MarkdownRecursiveChunker
 from rag_system.indexing.representations import EmbeddingGenerator, select_embedder
 from rag_system.indexing.embedders import LanceDBManager, VectorIndexer
@ -15,7 +15,7 @@ class IndexingPipeline:
        self.config = config
        self.llm_client = ollama_client
        self.ollama_config = ollama_config
-        self.pdf_converter = PDFConverter()
+        self.document_converter = DocumentConverter()
        # Chunker selection: docling (token-based) or legacy (character-based)
        chunker_mode = config.get("chunker_mode", "docling")
        
@ -157,7 +157,7 @@ class IndexingPipeline:
                        document_id = os.path.basename(file_path)
                        print(f"Processing: {document_id}")
                        
-                        pages_data = self.pdf_converter.convert_to_markdown(file_path)
+                        pages_data = self.document_converter.convert_to_markdown(file_path)
                        file_chunks = []
                        
                        for tpl in pages_data:
--- a/simple_create_index.sh
+++ b/simple_create_index.sh
@ -71,7 +71,7 @@ validate_documents() {
        if [ -f "$doc" ]; then
            # Check file extension
            case "${doc##*.}" in
-                pdf|txt|docx|md)
+                pdf|txt|docx|md|html|htm)
                    valid_docs+=("$doc")
                    print_status "✓ Valid document: $doc"
                    ;;
@ -188,7 +188,7 @@ show_usage() {
    echo "  $0 \"Research Papers\" \"paper1.pdf\" \"paper2.pdf\" \"notes.txt\""
    echo "  $0 \"Invoice Collection\" ./invoices/*.pdf"
    echo ""
-    echo "Supported file types: PDF, TXT, DOCX, MD"
+    echo "Supported file types: PDF, TXT, DOCX, MD, HTML"
 }

 # Main script
@ -225,4 +225,4 @@ main() {
 }

 # Run main function with all arguments
-main "$@" 
+main "$@"  
--- a/src/components/IndexForm.tsx
+++ b/src/components/IndexForm.tsx
@ -98,8 +98,8 @@ export function IndexForm({ onClose, onIndexed }: Props) {
            onDrop={(e)=>{e.preventDefault(); if(e.dataTransfer.files) setFiles(e.dataTransfer.files)}}
          >
            <svg width="32" height="32" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round" className="mb-2 text-white/80"><path d="M4 16v2a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2v-2"/><polyline points="7 10 12 5 17 10"/><line x1="12" y1="5" x2="12" y2="16"/></svg>
-            <span className="text-xs text-gray-400">Drag & Drop PDFs here or click to browse</span>
-            <input id="file-upload" type="file" accept="application/pdf" multiple className="hidden" onChange={(e)=>setFiles(e.target.files)} />
+            <span className="text-xs text-gray-400">Drag & Drop documents here or click to browse</span>
+            <input id="file-upload" type="file" accept="application/pdf,.docx,.html,.htm" multiple className="hidden" onChange={(e)=>setFiles(e.target.files)} />
          </label>
          {files && <p className="mt-1 text-xs text-green-400">{files.length} file(s) selected</p>}
        </div>
@ -220,4 +220,4 @@ export function IndexForm({ onClose, onIndexed }: Props) {
      </div>
    </div>
  );
-}      
+}            
--- a/src/components/IndexWizard.tsx
+++ b/src/components/IndexWizard.tsx
@ -24,8 +24,8 @@ export function IndexWizard({ onClose }: Props) {

        <div className="space-y-4">
          <div>
-            <label className="block text-sm mb-1">PDF files</label>
-            <input type="file" accept="application/pdf" multiple onChange={handleFile} className="text-sm" />
+            <label className="block text-sm mb-1">Document files</label>
+            <input type="file" accept="application/pdf,.docx,.html,.htm" multiple onChange={handleFile} className="text-sm" />
          </div>

          <div className="grid grid-cols-2 gap-4">
@ -69,4 +69,4 @@ export function IndexWizard({ onClose }: Props) {
      </div>
    </div>
  );
-} 
+}  
--- a/src/components/ui/chat-input.tsx
+++ b/src/components/ui/chat-input.tsx
@ -89,8 +89,12 @@ export function ChatInput({
        lastModified: file.lastModified
      });
      
-      // Only allow PDF files for now
-      if (file.type === 'application/pdf') {
+      if (file.type === 'application/pdf' || 
+          file.type === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
+          file.type === 'text/html' ||
+          file.name.toLowerCase().endsWith('.html') ||
+          file.name.toLowerCase().endsWith('.htm') ||
+          file.name.toLowerCase().endsWith('.docx')) {
        newFiles.push({
          id: crypto.randomUUID(),
          name: file.name,
@ -99,7 +103,7 @@ export function ChatInput({
          file: file,
        })
      } else {
-        console.log('🔧 Frontend: File rejected - not PDF:', file.type);
+        console.log('🔧 Frontend: File rejected - unsupported format:', file.type);
      }
    }

@ -153,7 +157,7 @@ export function ChatInput({

        <div className="bg-white/5 backdrop-blur border border-white/10 rounded-2xl px-5 pt-4 pb-3 space-y-2">
          {/* Hidden file input (kept for future use) */}
-          <input ref={fileInputRef} type="file" accept=".pdf" multiple onChange={handleFileChange} className="hidden" />
+          <input ref={fileInputRef} type="file" accept=".pdf,.docx,.html,.htm" multiple onChange={handleFileChange} className="hidden" />

          {/* Textarea */}
          <textarea
@ -200,4 +204,4 @@ export function ChatInput({
      </form>
    </div>
  )
-} 
+}  
--- a/src/components/ui/empty-chat-state.tsx
+++ b/src/components/ui/empty-chat-state.tsx
@ -115,8 +115,12 @@ export function EmptyChatState({
        const newFiles: AttachedFile[] = [];
        for (let i = 0; i < files.length; i++) {
            const file = files[i];
-            // Only allow PDF files for now
-            if (file.type === 'application/pdf') {
+            if (file.type === 'application/pdf' || 
+                file.type === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
+                file.type === 'text/html' ||
+                file.name.toLowerCase().endsWith('.html') ||
+                file.name.toLowerCase().endsWith('.htm') ||
+                file.name.toLowerCase().endsWith('.docx')) {
                newFiles.push({
                    id: crypto.randomUUID(),
                    name: file.name,
@ -220,7 +224,7 @@ export function EmptyChatState({
                    <input
                        ref={fileInputRef}
                        type="file"
-                        accept=".pdf"
+                        accept=".pdf,.docx,.html,.htm"
                        multiple
                        onChange={handleFileChange}
                        className="hidden"
@ -278,4 +282,4 @@ export function EmptyChatState({
            </div>
        </div>
    );
-} 
+}  
--- a/src/test-upload.html
+++ b/src/test-upload.html
@ -6,7 +6,7 @@
 <body>
    <h1>Test PDF Upload</h1>
    <form id="uploadForm">
-        <input type="file" id="fileInput" accept=".pdf" />
+        <input type="file" id="fileInput" accept=".pdf,.docx,.html,.htm" />
        <button type="submit">Upload PDF</button>
    </form>
    
@ -51,4 +51,4 @@
        });
    </script>
 </body>
-</html> 
+</html>