diff --git a/create_index_script.py b/create_index_script.py index d8610ea..dc7b894 100644 --- a/create_index_script.py +++ b/create_index_script.py @@ -101,7 +101,7 @@ class IndexCreator: elif choice == "2": dir_path = self.get_user_input("Enter directory path") if os.path.isdir(dir_path): - supported_extensions = ['.pdf', '.txt', '.docx', '.md'] + supported_extensions = ['.pdf', '.txt', '.docx', '.md', '.html', '.htm'] found_docs = [] for ext in supported_extensions: @@ -369,4 +369,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() \ No newline at end of file diff --git a/demo_batch_indexing.py b/demo_batch_indexing.py index d925a52..06a1847 100644 --- a/demo_batch_indexing.py +++ b/demo_batch_indexing.py @@ -113,7 +113,7 @@ class BatchIndexingDemo: if os.path.exists(doc_path): # Check file extension ext = Path(doc_path).suffix.lower() - if ext in ['.pdf', '.txt', '.docx', '.md']: + if ext in ['.pdf', '.txt', '.docx', '.md', '.html', '.htm']: valid_documents.append(doc_path) print(f" ✅ {doc_path}") else: @@ -383,4 +383,4 @@ comprehensive processing pipelines. if __name__ == "__main__": - main() \ No newline at end of file + main() \ No newline at end of file diff --git a/rag_system/ingestion/document_converter.py b/rag_system/ingestion/document_converter.py new file mode 100644 index 0000000..4f68fd8 --- /dev/null +++ b/rag_system/ingestion/document_converter.py @@ -0,0 +1,114 @@ +from typing import List, Tuple, Dict, Any +from docling.document_converter import DocumentConverter as DoclingConverter, PdfFormatOption +from docling.datamodel.pipeline_options import PdfPipelineOptions, OcrMacOptions +from docling.datamodel.base_models import InputFormat +import fitz # PyMuPDF for quick text inspection +import os + +class DocumentConverter: + """ + A class to convert various document formats to structured Markdown using the docling library. + Supports PDF, DOCX, HTML, and other formats. + """ + + # Mapping of file extensions to InputFormat + SUPPORTED_FORMATS = { + '.pdf': InputFormat.PDF, + '.docx': InputFormat.DOCX, + '.html': InputFormat.HTML, + '.htm': InputFormat.HTML, + } + + def __init__(self): + """Initializes the docling document converter with forced OCR enabled for macOS.""" + try: + # --- Converter WITHOUT OCR (fast path) --- + pipeline_no_ocr = PdfPipelineOptions() + pipeline_no_ocr.do_ocr = False + format_no_ocr = { + InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_no_ocr) + } + self.converter_no_ocr = DoclingConverter(format_options=format_no_ocr) + + # --- Converter WITH OCR (fallback) --- + pipeline_ocr = PdfPipelineOptions() + pipeline_ocr.do_ocr = True + ocr_options = OcrMacOptions(force_full_page_ocr=True) + pipeline_ocr.ocr_options = ocr_options + format_ocr = { + InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_ocr) + } + self.converter_ocr = DoclingConverter(format_options=format_ocr) + + self.converter_general = DoclingConverter() + + print("docling DocumentConverter(s) initialized (OCR + no-OCR + general).") + except Exception as e: + print(f"Error initializing docling DocumentConverter(s): {e}") + self.converter_no_ocr = None + self.converter_ocr = None + self.converter_general = None + + def convert_to_markdown(self, file_path: str) -> List[Tuple[str, Dict[str, Any]]]: + """ + Converts a document to a single Markdown string, preserving layout and tables. + Supports PDF, DOCX, HTML, and other formats. + """ + if not (self.converter_no_ocr and self.converter_ocr and self.converter_general): + print("docling converters not available. Skipping conversion.") + return [] + + file_ext = os.path.splitext(file_path)[1].lower() + if file_ext not in self.SUPPORTED_FORMATS: + print(f"Unsupported file format: {file_ext}") + return [] + + input_format = self.SUPPORTED_FORMATS[file_ext] + + if input_format == InputFormat.PDF: + return self._convert_pdf_to_markdown(file_path) + else: + return self._convert_general_to_markdown(file_path, input_format) + + def _convert_pdf_to_markdown(self, pdf_path: str) -> List[Tuple[str, Dict[str, Any]]]: + """Convert PDF with OCR detection logic.""" + # Quick heuristic: if the PDF already contains a text layer, skip OCR for speed + def _pdf_has_text(path: str) -> bool: + try: + doc = fitz.open(path) + for page in doc: + if page.get_text("text").strip(): + return True + except Exception: + pass + return False + + use_ocr = not _pdf_has_text(pdf_path) + converter = self.converter_ocr if use_ocr else self.converter_no_ocr + ocr_msg = "(OCR enabled)" if use_ocr else "(no OCR)" + + print(f"Converting {pdf_path} to Markdown using docling {ocr_msg}...") + return self._perform_conversion(pdf_path, converter, ocr_msg) + + def _convert_general_to_markdown(self, file_path: str, input_format: InputFormat) -> List[Tuple[str, Dict[str, Any]]]: + """Convert non-PDF formats using general converter.""" + print(f"Converting {file_path} ({input_format.name}) to Markdown using docling...") + return self._perform_conversion(file_path, self.converter_general, f"({input_format.name})") + + def _perform_conversion(self, file_path: str, converter, format_msg: str) -> List[Tuple[str, Dict[str, Any]]]: + """Perform the actual conversion using the specified converter.""" + pages_data = [] + try: + result = converter.convert(file_path) + markdown_content = result.document.export_to_markdown() + + metadata = {"source": file_path} + # Return the *DoclingDocument* object as third tuple element so downstream + # chunkers that understand the element tree can use it. Legacy callers that + # expect only (markdown, metadata) can simply ignore the extra value. + pages_data.append((markdown_content, metadata, result.document)) + print(f"Successfully converted {file_path} with docling {format_msg}.") + return pages_data + except Exception as e: + print(f"Error processing {file_path} with docling: {e}") + return [] diff --git a/rag_system/ingestion/pdf_converter.py b/rag_system/ingestion/pdf_converter.py deleted file mode 100644 index 5b7f603..0000000 --- a/rag_system/ingestion/pdf_converter.py +++ /dev/null @@ -1,76 +0,0 @@ -from typing import List, Tuple, Dict, Any -from docling.document_converter import DocumentConverter, PdfFormatOption -from docling.datamodel.pipeline_options import PdfPipelineOptions, OcrMacOptions -from docling.datamodel.base_models import InputFormat -import fitz # PyMuPDF for quick text inspection - -class PDFConverter: - """ - A class to convert PDF files to structured Markdown using the docling library. - """ - def __init__(self): - """Initializes the docling document converter with forced OCR enabled for macOS.""" - try: - # --- Converter WITHOUT OCR (fast path) --- - pipeline_no_ocr = PdfPipelineOptions() - pipeline_no_ocr.do_ocr = False - format_no_ocr = { - InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_no_ocr) - } - self.converter_no_ocr = DocumentConverter(format_options=format_no_ocr) - - # --- Converter WITH OCR (fallback) --- - pipeline_ocr = PdfPipelineOptions() - pipeline_ocr.do_ocr = True - ocr_options = OcrMacOptions(force_full_page_ocr=True) - pipeline_ocr.ocr_options = ocr_options - format_ocr = { - InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_ocr) - } - self.converter_ocr = DocumentConverter(format_options=format_ocr) - - print("docling DocumentConverter(s) initialized (OCR + no-OCR).") - except Exception as e: - print(f"Error initializing docling DocumentConverter(s): {e}") - self.converter_no_ocr = None - self.converter_ocr = None - - def convert_to_markdown(self, pdf_path: str) -> List[Tuple[str, Dict[str, Any]]]: - """ - Converts a PDF to a single Markdown string, preserving layout and tables. - """ - if not (self.converter_no_ocr and self.converter_ocr): - print("docling converters not available. Skipping conversion.") - return [] - - # Quick heuristic: if the PDF already contains a text layer, skip OCR for speed - def _pdf_has_text(path: str) -> bool: - try: - doc = fitz.open(path) - for page in doc: - if page.get_text("text").strip(): - return True - except Exception: - pass - return False - - use_ocr = not _pdf_has_text(pdf_path) - converter = self.converter_ocr if use_ocr else self.converter_no_ocr - ocr_msg = "(OCR enabled)" if use_ocr else "(no OCR)" - - print(f"Converting {pdf_path} to Markdown using docling {ocr_msg}...") - pages_data = [] - try: - result = converter.convert(pdf_path) - markdown_content = result.document.export_to_markdown() - - metadata = {"source": pdf_path} - # Return the *DoclingDocument* object as third tuple element so downstream - # chunkers that understand the element tree can use it. Legacy callers that - # expect only (markdown, metadata) can simply ignore the extra value. - pages_data.append((markdown_content, metadata, result.document)) - print(f"Successfully converted {pdf_path} with docling {ocr_msg}.") - return pages_data - except Exception as e: - print(f"Error processing PDF {pdf_path} with docling: {e}") - return [] diff --git a/rag_system/pipelines/indexing_pipeline.py b/rag_system/pipelines/indexing_pipeline.py index fdca054..9fc61e7 100644 --- a/rag_system/pipelines/indexing_pipeline.py +++ b/rag_system/pipelines/indexing_pipeline.py @@ -1,7 +1,7 @@ from typing import List, Dict, Any import os import networkx as nx -from rag_system.ingestion.pdf_converter import PDFConverter +from rag_system.ingestion.document_converter import DocumentConverter from rag_system.ingestion.chunking import MarkdownRecursiveChunker from rag_system.indexing.representations import EmbeddingGenerator, select_embedder from rag_system.indexing.embedders import LanceDBManager, VectorIndexer @@ -15,7 +15,7 @@ class IndexingPipeline: self.config = config self.llm_client = ollama_client self.ollama_config = ollama_config - self.pdf_converter = PDFConverter() + self.document_converter = DocumentConverter() # Chunker selection: docling (token-based) or legacy (character-based) chunker_mode = config.get("chunker_mode", "docling") @@ -157,7 +157,7 @@ class IndexingPipeline: document_id = os.path.basename(file_path) print(f"Processing: {document_id}") - pages_data = self.pdf_converter.convert_to_markdown(file_path) + pages_data = self.document_converter.convert_to_markdown(file_path) file_chunks = [] for tpl in pages_data: diff --git a/simple_create_index.sh b/simple_create_index.sh index d95f47c..ebe5b84 100755 --- a/simple_create_index.sh +++ b/simple_create_index.sh @@ -71,7 +71,7 @@ validate_documents() { if [ -f "$doc" ]; then # Check file extension case "${doc##*.}" in - pdf|txt|docx|md) + pdf|txt|docx|md|html|htm) valid_docs+=("$doc") print_status "✓ Valid document: $doc" ;; @@ -188,7 +188,7 @@ show_usage() { echo " $0 \"Research Papers\" \"paper1.pdf\" \"paper2.pdf\" \"notes.txt\"" echo " $0 \"Invoice Collection\" ./invoices/*.pdf" echo "" - echo "Supported file types: PDF, TXT, DOCX, MD" + echo "Supported file types: PDF, TXT, DOCX, MD, HTML" } # Main script @@ -225,4 +225,4 @@ main() { } # Run main function with all arguments -main "$@" \ No newline at end of file +main "$@" \ No newline at end of file diff --git a/src/components/IndexForm.tsx b/src/components/IndexForm.tsx index 1f1365f..072ef27 100644 --- a/src/components/IndexForm.tsx +++ b/src/components/IndexForm.tsx @@ -98,8 +98,8 @@ export function IndexForm({ onClose, onIndexed }: Props) { onDrop={(e)=>{e.preventDefault(); if(e.dataTransfer.files) setFiles(e.dataTransfer.files)}} > - Drag & Drop PDFs here or click to browse - setFiles(e.target.files)} /> + Drag & Drop documents here or click to browse + setFiles(e.target.files)} /> {files &&
{files.length} file(s) selected
} @@ -220,4 +220,4 @@ export function IndexForm({ onClose, onIndexed }: Props) { ); -} \ No newline at end of file +} \ No newline at end of file diff --git a/src/components/IndexWizard.tsx b/src/components/IndexWizard.tsx index 1bbd361..1eb37f4 100644 --- a/src/components/IndexWizard.tsx +++ b/src/components/IndexWizard.tsx @@ -24,8 +24,8 @@ export function IndexWizard({ onClose }: Props) {