fix: make both chunking methods token-based

- Update MarkdownRecursiveChunker to use tokenizer for token-based sizing
- Update DoclingChunker to use tokenizer with proper error handling
- Ensure IndexingPipeline passes tokenizer_model to both chunkers
- Update UI tooltips to reflect that both modes now use tokens
- Keep Docling as default for enhanced granularity features
- Add fallback to character-based approximation when tokenizer fails

Co-Authored-By: PromptEngineer <jnfarooq@outlook.com>
Devin AI 2025-07-15 06:38:55 +00:00
parent 3b648520c9
commit a13a71d247
4 changed files with 67 additions and 18 deletions
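A minimal, self-contained sketch of the sizing rule both chunkers now share (the free function token_len and the sample inputs are illustrative; in the diffs below the same logic lives in a _token_len method on each chunker):

from transformers import AutoTokenizer

def token_len(text: str, tokenizer=None) -> int:
    # Count tokens with the embedding model's tokenizer; fall back to ~4 characters per token.
    if tokenizer is not None:
        return len(tokenizer.tokenize(text))
    return max(1, len(text) // 4)

tok = AutoTokenizer.from_pretrained("Qwen/Qwen3-Embedding-0.6B", trust_remote_code=True)
print(token_len("Chunk sizes are now measured in tokens.", tok))
print(token_len("x" * 1500))  # no tokenizer: 1500 chars ~ 375 tokens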

View File

@@ -1,5 +1,6 @@
 from typing import List, Dict, Any, Optional
 import re
+from transformers import AutoTokenizer

 class MarkdownRecursiveChunker:
     """
@@ -7,11 +8,31 @@ class MarkdownRecursiveChunker:
     and embeds document-level metadata into each chunk.
     """

-    def __init__(self, max_chunk_size: int = 1500, min_chunk_size: int = 200):
+    def __init__(self, max_chunk_size: int = 1500, min_chunk_size: int = 200, tokenizer_model: str = "Qwen/Qwen3-Embedding-0.6B"):
         self.max_chunk_size = max_chunk_size
         self.min_chunk_size = min_chunk_size
         self.split_priority = ["\n## ", "\n### ", "\n#### ", "```", "\n\n"]
+        repo_id = tokenizer_model
+        if "/" not in tokenizer_model and not tokenizer_model.startswith("Qwen/"):
+            repo_id = {
+                "qwen3-embedding-0.6b": "Qwen/Qwen3-Embedding-0.6B",
+            }.get(tokenizer_model.lower(), tokenizer_model)
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
+        except Exception as e:
+            print(f"Warning: Failed to load tokenizer {repo_id}: {e}")
+            print("Falling back to character-based approximation (4 chars ≈ 1 token)")
+            self.tokenizer = None
+
+    def _token_len(self, text: str) -> int:
+        """Get token count for text using the tokenizer."""
+        if self.tokenizer is not None:
+            return len(self.tokenizer.tokenize(text))
+        else:
+            return max(1, len(text) // 4)
+
     def _split_text(self, text: str, separators: List[str]) -> List[str]:
         final_chunks = []
         chunks_to_process = [text]
@@ -19,7 +40,7 @@
         for sep in separators:
             new_chunks = []
             for chunk in chunks_to_process:
-                if len(chunk) > self.max_chunk_size:
+                if self._token_len(chunk) > self.max_chunk_size:
                     sub_chunks = re.split(f'({sep})', chunk)
                     combined = []
                     i = 0
@@ -38,8 +59,19 @@
         final_chunks = []
         for chunk in chunks_to_process:
-            if len(chunk) > self.max_chunk_size:
-                final_chunks.extend([chunk[i:i+self.max_chunk_size] for i in range(0, len(chunk), self.max_chunk_size)])
+            if self._token_len(chunk) > self.max_chunk_size:
+                words = chunk.split()
+                current_chunk = ""
+                for word in words:
+                    test_chunk = current_chunk + " " + word if current_chunk else word
+                    if self._token_len(test_chunk) <= self.max_chunk_size:
+                        current_chunk = test_chunk
+                    else:
+                        if current_chunk:
+                            final_chunks.append(current_chunk)
+                        current_chunk = word
+                if current_chunk:
+                    final_chunks.append(current_chunk)
             else:
                 final_chunks.append(chunk)
@@ -65,10 +97,11 @@
         merged_chunks_text = []
         current_chunk = ""
         for chunk_text in raw_chunks:
-            if not current_chunk or len(current_chunk) + len(chunk_text) <= self.max_chunk_size:
-                current_chunk += chunk_text
-            elif len(current_chunk) < self.min_chunk_size:
-                current_chunk += chunk_text
+            test_chunk = current_chunk + chunk_text if current_chunk else chunk_text
+            if not current_chunk or self._token_len(test_chunk) <= self.max_chunk_size:
+                current_chunk = test_chunk
+            elif self._token_len(current_chunk) < self.min_chunk_size:
+                current_chunk = test_chunk
             else:
                 merged_chunks_text.append(current_chunk)
                 current_chunk = chunk_text
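When no separator brings a piece under the limit, the new word-level packing added in this file behaves like this standalone sketch (pack_words is a hypothetical free-function rendering of the loop inside _split_text):

def pack_words(chunk, max_tokens, token_len):
    # Greedily grow a piece word by word; start a new piece once adding a word would exceed max_tokens.
    pieces, current = [], ""
    for word in chunk.split():
        candidate = f"{current} {word}" if current else word
        if token_len(candidate) <= max_tokens:
            current = candidate
        else:
            if current:
                pieces.append(current)
            current = word
    if current:
        pieces.append(current)
    return pieces

# With the 4-chars-per-token fallback, a long paragraph is cut into pieces of at most ~50 tokens each.
print(pack_words("lorem ipsum dolor sit amet " * 100, 50, lambda t: max(1, len(t) // 4)))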

View File

@@ -25,14 +25,24 @@ class DoclingChunker:
         repo_id = {
             "qwen3-embedding-0.6b": "Qwen/Qwen3-Embedding-0.6B",
         }.get(tokenizer_model.lower(), tokenizer_model)
-        self.tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
+        except Exception as e:
+            print(f"Warning: Failed to load tokenizer {repo_id}: {e}")
+            print("Falling back to character-based approximation (4 chars ≈ 1 token)")
+            self.tokenizer = None
         # Fallback simple sentence splitter (period, question, exclamation, newline)
         self._sent_re = re.compile(r"(?<=[\.\!\?])\s+|\n+")
         self.legacy = MarkdownRecursiveChunker(max_chunk_size=10_000, min_chunk_size=100)

     # ------------------------------------------------------------------
+    def _token_len(self, text: str) -> int:
+        if self.tokenizer is not None:
+            return len(self.tokenizer.tokenize(text))
+        else:
+            # Fallback: approximate 4 characters per token
+            return max(1, len(text) // 4)
+
     def split_markdown(self, markdown: str, *, document_id: str, metadata: Dict[str, Any]) -> List[Dict[str, Any]]:
         """Split one Markdown doc into chunks with max_tokens limit."""
@@ -84,7 +94,11 @@ class DoclingChunker:
         metadata = metadata or {}

         def _token_len(txt: str) -> int:
-            return len(self.tokenizer.tokenize(txt))
+            if self.tokenizer is not None:
+                return len(self.tokenizer.tokenize(txt))
+            else:
+                # Fallback: approximate 4 characters per token
+                return max(1, len(txt) // 4)

         chunks: List[Dict[str, Any]] = []
         global_idx = 0

View File

@@ -39,12 +39,14 @@ class IndexingPipeline:
                 print(f"⚠️ Failed to initialise DoclingChunker: {e}. Falling back to legacy chunker.")
                 self.chunker = MarkdownRecursiveChunker(
                     max_chunk_size=chunk_size,
-                    min_chunk_size=min(chunk_overlap, chunk_size // 4) # Sensible minimum
+                    min_chunk_size=min(chunk_overlap, chunk_size // 4), # Sensible minimum
+                    tokenizer_model=config.get("embedding_model_name", "Qwen/Qwen3-Embedding-0.6B")
                 )
         else:
             self.chunker = MarkdownRecursiveChunker(
                 max_chunk_size=chunk_size,
-                min_chunk_size=min(chunk_overlap, chunk_size // 4) # Sensible minimum
+                min_chunk_size=min(chunk_overlap, chunk_size // 4), # Sensible minimum
+                tokenizer_model=config.get("embedding_model_name", "Qwen/Qwen3-Embedding-0.6B")
             )

         retriever_configs = self.config.get("retrievers") or self.config.get("retrieval", {})
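Hypothetical wiring for the change above (the config keys and sizes are illustrative; only embedding_model_name is read by the code in this hunk, and MarkdownRecursiveChunker is assumed importable from the project's chunking module):

config = {"embedding_model_name": "Qwen/Qwen3-Embedding-0.6B"}
chunk_size, chunk_overlap = 512, 64

chunker = MarkdownRecursiveChunker(
    max_chunk_size=chunk_size,                           # interpreted as tokens after this commit
    min_chunk_size=min(chunk_overlap, chunk_size // 4),
    tokenizer_model=config.get("embedding_model_name", "Qwen/Qwen3-Embedding-0.6B"),
)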

View File

@@ -118,13 +118,13 @@ export function IndexForm({ onClose, onIndexed }: Props) {
             <GlassToggle checked={enableLateChunk} onChange={setEnableLateChunk} />
           </div>
           <div className="flex items-center gap-2">
-            <span className="text-xs text-gray-400">High-recall chunking <InfoTooltip text="Sentence-level packing (Docling) for maximum recall at indexing time. Uses token-based chunking (recommended)." size={12} /></span>
+            <span className="text-xs text-gray-400">High-recall chunking <InfoTooltip text="Advanced sentence-level packing with Docling features for maximum recall. Both modes use token-based sizing." size={12} /></span>
             <GlassToggle checked={enableDoclingChunk} onChange={setEnableDoclingChunk} />
           </div>
         </div>
         <div className="grid grid-cols-2 gap-4 mt-4">
           <div>
-            <label className="flex items-center gap-1 text-xs mb-1 text-gray-400">Chunk size <InfoTooltip text="Maximum token length for each chunk when using high-recall chunking, or character length for legacy chunking." size={12} /></label>
+            <label className="flex items-center gap-1 text-xs mb-1 text-gray-400">Chunk size <InfoTooltip text="Maximum token length for each chunk. Both legacy and high-recall modes now use token-based sizing." size={12} /></label>
             <GlassInput type="number" value={chunkSize} onChange={(e) => setChunkSize(parseInt(e.target.value))} />
           </div>
           <div>