localGPT/create_index_script.py

#!/usr/bin/env python3
"""
Interactive Index Creation Script for LocalGPT RAG System

This script provides a user-friendly interface for creating document indexes
using the LocalGPT RAG system. It supports both single documents and batch
processing of multiple documents.

Usage:
    python create_index_script.py
    python create_index_script.py --batch
    python create_index_script.py --config custom_config.json
"""

import os
import sys
import json
import argparse
from typing import List, Optional
from pathlib import Path

# Add the project root to the path so we can import rag_system modules
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

try:
    from rag_system.main import PIPELINE_CONFIGS, get_agent
    from rag_system.pipelines.indexing_pipeline import IndexingPipeline
    from rag_system.utils.ollama_client import OllamaClient
    from backend.database import ChatDatabase
except ImportError as e:
    print(f"❌ Error importing required modules: {e}")
    print("Please ensure you're running this script from the project root directory.")
    sys.exit(1)


class IndexCreator:
    """Interactive index creation utility."""

    def __init__(self, config_path: Optional[str] = None):
        """Initialize the index creator with optional custom configuration."""
        self.db = ChatDatabase()
        self.config = self._load_config(config_path)

        # Initialize Ollama client
        self.ollama_client = OllamaClient()
        self.ollama_config = {
            "generation_model": "qwen3:0.6b",
            "embedding_model": "qwen3:0.6b"
        }

        # Initialize indexing pipeline
        self.pipeline = IndexingPipeline(
            self.config,
            self.ollama_client,
            self.ollama_config
        )

    def _load_config(self, config_path: Optional[str] = None) -> dict:
        """Load configuration from file or use default."""
        if config_path and os.path.exists(config_path):
            try:
                with open(config_path, 'r') as f:
                    return json.load(f)
            except Exception as e:
                print(f"⚠️  Error loading config from {config_path}: {e}")
                print("Using default configuration...")

        return PIPELINE_CONFIGS.get("default", {})

    def get_user_input(self, prompt: str, default: str = "") -> str:
        """Get user input with optional default value."""
        if default:
            user_input = input(f"{prompt} [{default}]: ").strip()
            return user_input if user_input else default
        return input(f"{prompt}: ").strip()

    def select_documents(self) -> List[str]:
        """Interactive document selection."""
        print("\n📁 Document Selection")
        print("=" * 50)

        documents = []

        while True:
            print("\nOptions:")
            print("1. Add a single document")
            print("2. Add all documents from a directory")
            print("3. Finish and proceed with selected documents")
            print("4. Show selected documents")

            choice = self.get_user_input("Select an option (1-4)", "1")

            if choice == "1":
                doc_path = self.get_user_input("Enter document path")
                if os.path.exists(doc_path):
                    documents.append(os.path.abspath(doc_path))
                    print(f"✅ Added: {doc_path}")
                else:
                    print(f"❌ File not found: {doc_path}")

            elif choice == "2":
                dir_path = self.get_user_input("Enter directory path")
                if os.path.isdir(dir_path):
                    supported_extensions = ['.pdf', '.txt', '.docx', '.md', '.html', '.htm']
                    found_docs = []

                    for ext in supported_extensions:
                        found_docs.extend(Path(dir_path).glob(f"*{ext}"))
                        found_docs.extend(Path(dir_path).glob(f"**/*{ext}"))

                    if found_docs:
                        print(f"Found {len(found_docs)} documents:")
                        for doc in found_docs:
                            print(f"  - {doc}")

                        if self.get_user_input("Add all these documents? (y/n)", "y").lower() == 'y':
                            documents.extend([str(doc.absolute()) for doc in found_docs])
                            print(f"✅ Added {len(found_docs)} documents")
                    else:
                        print("❌ No supported documents found in directory")
                else:
                    print(f"❌ Directory not found: {dir_path}")

            elif choice == "3":
                if documents:
                    break
                else:
                    print("❌ No documents selected. Please add at least one document.")

            elif choice == "4":
                if documents:
                    print(f"\n📄 Selected documents ({len(documents)}):")
                    for i, doc in enumerate(documents, 1):
                        print(f"  {i}. {doc}")
                else:
                    print("No documents selected yet.")

            else:
                print("Invalid choice. Please select 1-4.")

        return documents

    def configure_processing(self) -> dict:
        """Interactive processing configuration."""
        print("\n⚙️  Processing Configuration")
        print("=" * 50)

        print("Configure how documents will be processed:")

        # Basic settings
        chunk_size = int(self.get_user_input("Chunk size", "512"))
        chunk_overlap = int(self.get_user_input("Chunk overlap", "64"))

        # Advanced settings
        print("\nAdvanced options:")
        enable_enrich = self.get_user_input("Enable contextual enrichment? (y/n)", "y").lower() == 'y'
        enable_latechunk = self.get_user_input("Enable late chunking? (y/n)", "y").lower() == 'y'
        enable_docling = self.get_user_input("Enable Docling chunking? (y/n)", "y").lower() == 'y'

        # Model selection
        print("\nModel Configuration:")
        embedding_model = self.get_user_input("Embedding model", "Qwen/Qwen3-Embedding-0.6B")
        generation_model = self.get_user_input("Generation model", "qwen3:0.6b")

        return {
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
            "enable_enrich": enable_enrich,
            "enable_latechunk": enable_latechunk,
            "enable_docling": enable_docling,
            "embedding_model": embedding_model,
            "generation_model": generation_model,
            "retrieval_mode": "hybrid",
            "window_size": 2
        }

    def create_index_interactive(self) -> None:
        """Run the interactive index creation process."""
        print("🚀 LocalGPT Index Creation Tool")
        print("=" * 50)

        # Get index details
        index_name = self.get_user_input("Enter index name")
        index_description = self.get_user_input("Enter index description (optional)")

        # Select documents
        documents = self.select_documents()

        # Configure processing
        processing_config = self.configure_processing()

        # Confirm creation
        print("\n📋 Index Summary")
        print("=" * 50)
        print(f"Name: {index_name}")
        print(f"Description: {index_description or 'None'}")
        print(f"Documents: {len(documents)}")
        print(f"Chunk size: {processing_config['chunk_size']}")
        print(f"Enrichment: {'Enabled' if processing_config['enable_enrich'] else 'Disabled'}")
        print(f"Embedding model: {processing_config['embedding_model']}")

        if self.get_user_input("\nProceed with index creation? (y/n)", "y").lower() != 'y':
            print("❌ Index creation cancelled.")
            return

        # Create the index
        try:
            print("\n🔥 Creating index...")

            # Create index record in database
            index_id = self.db.create_index(
                name=index_name,
                description=index_description,
                metadata=processing_config
            )

            # Add documents to index
            for doc_path in documents:
                filename = os.path.basename(doc_path)
                self.db.add_document_to_index(index_id, filename, doc_path)

            # Process documents through pipeline
            print("📚 Processing documents...")
            self.pipeline.process_documents(documents)

            print(f"\n✅ Index '{index_name}' created successfully!")
            print(f"Index ID: {index_id}")
            print(f"Processed {len(documents)} documents")

            # Test the index
            if self.get_user_input("\nTest the index with a sample query? (y/n)", "y").lower() == 'y':
                self.test_index(index_id)

        except Exception as e:
            print(f"❌ Error creating index: {e}")
            import traceback
            traceback.print_exc()

    def test_index(self, index_id: str) -> None:
        """Test the created index with a sample query."""
        try:
            print("\n🧪 Testing Index")
            print("=" * 50)

            # Get agent for testing
            agent = get_agent("default")

            # Test query
            test_query = self.get_user_input("Enter a test query", "What is this document about?")

            print(f"\nProcessing query: {test_query}")
            response = agent.run(test_query, table_name=f"text_pages_{index_id}")

            print(f"\n🤖 Response:")
            print(response)

        except Exception as e:
            print(f"❌ Error testing index: {e}")

    def batch_create_from_config(self, config_file: str) -> None:
        """Create index from batch configuration file."""
        try:
            with open(config_file, 'r') as f:
                batch_config = json.load(f)

            index_name = batch_config.get("index_name", "Batch Index")
            index_description = batch_config.get("index_description", "")
            documents = batch_config.get("documents", [])
            processing_config = batch_config.get("processing", {})

            if not documents:
                print("❌ No documents specified in batch configuration")
                return

            # Validate documents exist
            valid_documents = []
            for doc_path in documents:
                if os.path.exists(doc_path):
                    valid_documents.append(doc_path)
                else:
                    print(f"⚠️  Document not found: {doc_path}")

            if not valid_documents:
                print("❌ No valid documents found")
                return

            print(f"🚀 Creating batch index: {index_name}")
            print(f"📄 Processing {len(valid_documents)} documents...")

            # Create index
            index_id = self.db.create_index(
                name=index_name,
                description=index_description,
                metadata=processing_config
            )

            # Add documents
            for doc_path in valid_documents:
                filename = os.path.basename(doc_path)
                self.db.add_document_to_index(index_id, filename, doc_path)

            # Process documents
            self.pipeline.process_documents(valid_documents)

            print(f"✅ Batch index '{index_name}' created successfully!")
            print(f"Index ID: {index_id}")

        except Exception as e:
            print(f"❌ Error creating batch index: {e}")
            import traceback
            traceback.print_exc()


def create_sample_batch_config():
    """Create a sample batch configuration file."""
    sample_config = {
        "index_name": "Sample Batch Index",
        "index_description": "Example batch index configuration",
        "documents": [
            "./rag_system/documents/invoice_1039.pdf",
            "./rag_system/documents/invoice_1041.pdf"
        ],
        "processing": {
            "chunk_size": 512,
            "chunk_overlap": 64,
            "enable_enrich": True,
            "enable_latechunk": True,
            "enable_docling": True,
            "embedding_model": "Qwen/Qwen3-Embedding-0.6B",
            "generation_model": "qwen3:0.6b",
            "retrieval_mode": "hybrid",
            "window_size": 2
        }
    }

    with open("batch_indexing_config.json", "w") as f:
        json.dump(sample_config, f, indent=2)

    print("📄 Sample batch configuration created: batch_indexing_config.json")


def main():
    """Main entry point for the script."""
    parser = argparse.ArgumentParser(description="LocalGPT Index Creation Tool")
    parser.add_argument("--batch", help="Batch configuration file", type=str)
    parser.add_argument("--config", help="Custom pipeline configuration file", type=str)
    parser.add_argument("--create-sample", action="store_true", help="Create sample batch config")

    args = parser.parse_args()

    if args.create_sample:
        create_sample_batch_config()
        return

    try:
        creator = IndexCreator(config_path=args.config)

        if args.batch:
            creator.batch_create_from_config(args.batch)
        else:
            creator.create_index_interactive()

    except KeyboardInterrupt:
        print("\n\n❌ Operation cancelled by user.")
    except Exception as e:
        print(f"❌ Unexpected error: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()