localGPT/simple_create_index.sh
Devin AI d5929ce29b feat: Add support for DOCX and HTML file formats using docling
- Rename PDFConverter to DocumentConverter with multi-format support
- Add SUPPORTED_FORMATS mapping for PDF, DOCX, HTML, HTM extensions
- Update indexing pipeline to use DocumentConverter
- Update file validation across all frontend components and scripts
- Preserve existing PDF OCR detection logic
- Add format-specific conversion methods for different document types

Co-Authored-By: PromptEngineer <jnfarooq@outlook.com>
2025-07-21 20:40:39 +00:00

228 lines
6.0 KiB
Bash
Executable File

#!/bin/bash
# Simple Index Creation Script for LocalGPT RAG System
# Usage: ./simple_create_index.sh "Index Name" "path/to/document.pdf" [additional_files...]
set -e # Exit on any error
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Function to print colored output
print_status() {
echo -e "${BLUE}[INFO]${NC} $1"
}
print_success() {
echo -e "${GREEN}[SUCCESS]${NC} $1"
}
print_warning() {
echo -e "${YELLOW}[WARNING]${NC} $1"
}
print_error() {
echo -e "${RED}[ERROR]${NC} $1"
}
# Function to check if a command exists
command_exists() {
command -v "$1" >/dev/null 2>&1
}
# Function to check prerequisites
check_prerequisites() {
print_status "Checking prerequisites..."
# Check Python
if ! command_exists python3; then
print_error "Python 3 is required but not installed."
exit 1
fi
# Check if we're in the right directory
if [ ! -f "run_system.py" ] || [ ! -d "rag_system" ]; then
print_error "This script must be run from the LocalGPT project root directory."
exit 1
fi
# Check if Ollama is running
if ! curl -s http://localhost:11434/api/tags >/dev/null 2>&1; then
print_error "Ollama is not running. Please start Ollama first:"
echo " ollama serve"
exit 1
fi
print_success "Prerequisites check passed"
}
# Function to validate documents
validate_documents() {
local documents=("$@")
local valid_docs=()
print_status "Validating documents..."
for doc in "${documents[@]}"; do
if [ -f "$doc" ]; then
# Check file extension
case "${doc##*.}" in
pdf|txt|docx|md|html|htm)
valid_docs+=("$doc")
print_status "✓ Valid document: $doc"
;;
*)
print_warning "Unsupported file type: $doc (skipping)"
;;
esac
else
print_warning "File not found: $doc (skipping)"
fi
done
if [ ${#valid_docs[@]} -eq 0 ]; then
print_error "No valid documents found."
exit 1
fi
echo "${valid_docs[@]}"
}
# Function to create index using Python
create_index() {
local index_name="$1"
shift
local documents=("$@")
print_status "Creating index: $index_name"
print_status "Documents: ${documents[*]}"
# Create a temporary Python script to create the index
cat > /tmp/create_index_temp.py << EOF
#!/usr/bin/env python3
import sys
import os
import json
sys.path.insert(0, os.getcwd())
from rag_system.main import PIPELINE_CONFIGS
from rag_system.pipelines.indexing_pipeline import IndexingPipeline
from rag_system.utils.ollama_client import OllamaClient
from backend.database import ChatDatabase
import uuid
def create_index_simple():
try:
# Initialize database
db = ChatDatabase()
# Create index record
index_id = db.create_index(
name="$index_name",
description="Created with simple_create_index.sh",
metadata={
"chunk_size": 512,
"chunk_overlap": 64,
"enable_enrich": True,
"enable_latechunk": True,
"retrieval_mode": "hybrid",
"created_by": "simple_create_index.sh"
}
)
# Add documents to index
documents = [${documents[@]/#/\"} ${documents[@]/%/\"}]
for doc_path in documents:
if doc_path.strip(): # Skip empty strings
filename = os.path.basename(doc_path.strip())
db.add_document_to_index(index_id, filename, os.path.abspath(doc_path.strip()))
# Initialize pipeline
config = PIPELINE_CONFIGS.get("default", {})
ollama_client = OllamaClient()
ollama_config = {
"generation_model": "qwen3:0.6b",
"embedding_model": "qwen3:0.6b"
}
pipeline = IndexingPipeline(config, ollama_client, ollama_config)
# Process documents
valid_docs = [doc.strip() for doc in documents if doc.strip() and os.path.exists(doc.strip())]
if valid_docs:
pipeline.process_documents(valid_docs)
print(f"✅ Index '{index_name}' created successfully!")
print(f"Index ID: {index_id}")
print(f"Processed {len(valid_docs)} documents")
return index_id
except Exception as e:
print(f"❌ Error creating index: {e}")
import traceback
traceback.print_exc()
return None
if __name__ == "__main__":
create_index_simple()
EOF
# Run the Python script
python3 /tmp/create_index_temp.py
# Clean up
rm -f /tmp/create_index_temp.py
}
# Function to show usage
show_usage() {
echo "Usage: $0 \"Index Name\" \"path/to/document.pdf\" [additional_files...]"
echo ""
echo "Examples:"
echo " $0 \"My Documents\" \"document.pdf\""
echo " $0 \"Research Papers\" \"paper1.pdf\" \"paper2.pdf\" \"notes.txt\""
echo " $0 \"Invoice Collection\" ./invoices/*.pdf"
echo ""
echo "Supported file types: PDF, TXT, DOCX, MD, HTML"
}
# Main script
main() {
# Check arguments
if [ $# -lt 2 ]; then
print_error "Insufficient arguments provided."
show_usage
exit 1
fi
local index_name="$1"
shift
local documents=("$@")
# Check prerequisites
check_prerequisites
# Validate documents
local valid_documents
valid_documents=($(validate_documents "${documents[@]}"))
if [ ${#valid_documents[@]} -eq 0 ]; then
print_error "No valid documents to process."
exit 1
fi
# Create the index
print_status "Starting index creation process..."
create_index "$index_name" "${valid_documents[@]}"
print_success "Index creation completed!"
print_status "You can now use the index in the LocalGPT interface."
}
# Run main function with all arguments
main "$@"