fix: make both chunking methods token-based

- Update MarkdownRecursiveChunker to use tokenizer for token-based sizing
- Update DoclingChunker to use tokenizer with proper error handling
- Ensure IndexingPipeline passes tokenizer_model to both chunkers
- Update UI tooltips to reflect that both modes now use tokens
- Keep Docling as default for enhanced granularity features
- Add fallback to character-based approximation when tokenizer fails

Co-Authored-By: PromptEngineer <jnfarooq@outlook.com>
Devin AI 2025-07-15 06:38:55 +00:00
parent 3b648520c9
commit a13a71d247
4 changed files with 67 additions and 18 deletions
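A minimal, self-contained sketch of the sizing rule both chunkers now share (the free function token_len and the sample inputs are illustrative; in the diffs below the same logic lives in a _token_len method on each chunker):

from transformers import AutoTokenizer

def token_len(text: str, tokenizer=None) -> int:
    # Count tokens with the embedding model's tokenizer; fall back to ~4 characters per token.
    if tokenizer is not None:
        return len(tokenizer.tokenize(text))
    return max(1, len(text) // 4)

tok = AutoTokenizer.from_pretrained("Qwen/Qwen3-Embedding-0.6B", trust_remote_code=True)
print(token_len("Chunk sizes are now measured in tokens.", tok))
print(token_len("x" * 1500))  # no tokenizer: 1500 chars ~ 375 tokens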

View File

@@ -1,5 +1,6 @@
 from typing import List, Dict, Any, Optional
 import re
+from transformers import AutoTokenizer

 class MarkdownRecursiveChunker:
     """
@@ -7,11 +8,31 @@ class MarkdownRecursiveChunker:
     and embeds document-level metadata into each chunk.
     """

-    def __init__(self, max_chunk_size: int = 1500, min_chunk_size: int = 200):
+    def __init__(self, max_chunk_size: int = 1500, min_chunk_size: int = 200, tokenizer_model: str = "Qwen/Qwen3-Embedding-0.6B"):
         self.max_chunk_size = max_chunk_size
         self.min_chunk_size = min_chunk_size
         self.split_priority = ["\n## ", "\n### ", "\n#### ", "```", "\n\n"]
+        repo_id = tokenizer_model
+        if "/" not in tokenizer_model and not tokenizer_model.startswith("Qwen/"):
+            repo_id = {
+                "qwen3-embedding-0.6b": "Qwen/Qwen3-Embedding-0.6B",
+            }.get(tokenizer_model.lower(), tokenizer_model)
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
+        except Exception as e:
+            print(f"Warning: Failed to load tokenizer {repo_id}: {e}")
+            print("Falling back to character-based approximation (4 chars ≈ 1 token)")
+            self.tokenizer = None
+
+    def _token_len(self, text: str) -> int:
+        """Get token count for text using the tokenizer."""
+        if self.tokenizer is not None:
+            return len(self.tokenizer.tokenize(text))
+        else:
+            return max(1, len(text) // 4)
+
     def _split_text(self, text: str, separators: List[str]) -> List[str]:
         final_chunks = []
         chunks_to_process = [text]
@@ -19,7 +40,7 @@
         for sep in separators:
             new_chunks = []
             for chunk in chunks_to_process:
-                if len(chunk) > self.max_chunk_size:
+                if self._token_len(chunk) > self.max_chunk_size:
                     sub_chunks = re.split(f'({sep})', chunk)
                     combined = []
                     i = 0
@@ -38,8 +59,19 @@
         final_chunks = []
         for chunk in chunks_to_process:
-            if len(chunk) > self.max_chunk_size:
-                final_chunks.extend([chunk[i:i+self.max_chunk_size] for i in range(0, len(chunk), self.max_chunk_size)])
+            if self._token_len(chunk) > self.max_chunk_size:
+                words = chunk.split()
+                current_chunk = ""
+                for word in words:
+                    test_chunk = current_chunk + " " + word if current_chunk else word
+                    if self._token_len(test_chunk) <= self.max_chunk_size:
+                        current_chunk = test_chunk
+                    else:
+                        if current_chunk:
+                            final_chunks.append(current_chunk)
+                        current_chunk = word
+                if current_chunk:
+                    final_chunks.append(current_chunk)
             else:
                 final_chunks.append(chunk)
@@ -65,10 +97,11 @@
         merged_chunks_text = []
         current_chunk = ""
         for chunk_text in raw_chunks:
-            if not current_chunk or len(current_chunk) + len(chunk_text) <= self.max_chunk_size:
-                current_chunk += chunk_text
-            elif len(current_chunk) < self.min_chunk_size:
-                current_chunk += chunk_text
+            test_chunk = current_chunk + chunk_text if current_chunk else chunk_text
+            if not current_chunk or self._token_len(test_chunk) <= self.max_chunk_size:
+                current_chunk = test_chunk
+            elif self._token_len(current_chunk) < self.min_chunk_size:
+                current_chunk = test_chunk
             else:
                 merged_chunks_text.append(current_chunk)
                 current_chunk = chunk_text
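When no separator brings a piece under the limit, the new word-level packing added in this file behaves like this standalone sketch (pack_words is a hypothetical free-function rendering of the loop inside _split_text):

def pack_words(chunk, max_tokens, token_len):
    # Greedily grow a piece word by word; start a new piece once adding a word would exceed max_tokens.
    pieces, current = [], ""
    for word in chunk.split():
        candidate = f"{current} {word}" if current else word
        if token_len(candidate) <= max_tokens:
            current = candidate
        else:
            if current:
                pieces.append(current)
            current = word
    if current:
        pieces.append(current)
    return pieces

# With the 4-chars-per-token fallback, a long paragraph is cut into pieces of at most ~50 tokens each.
print(pack_words("lorem ipsum dolor sit amet " * 100, 50, lambda t: max(1, len(t) // 4)))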

View File

@@ -25,14 +25,24 @@ class DoclingChunker:
         repo_id = {
             "qwen3-embedding-0.6b": "Qwen/Qwen3-Embedding-0.6B",
         }.get(tokenizer_model.lower(), tokenizer_model)
-        self.tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
+        except Exception as e:
+            print(f"Warning: Failed to load tokenizer {repo_id}: {e}")
+            print("Falling back to character-based approximation (4 chars ≈ 1 token)")
+            self.tokenizer = None
         # Fallback simple sentence splitter (period, question, exclamation, newline)
         self._sent_re = re.compile(r"(?<=[\.\!\?])\s+|\n+")
         self.legacy = MarkdownRecursiveChunker(max_chunk_size=10_000, min_chunk_size=100)

     # ------------------------------------------------------------------
+    def _token_len(self, text: str) -> int:
+        if self.tokenizer is not None:
+            return len(self.tokenizer.tokenize(text))
+        else:
+            # Fallback: approximate 4 characters per token
+            return max(1, len(text) // 4)
+
     def split_markdown(self, markdown: str, *, document_id: str, metadata: Dict[str, Any]) -> List[Dict[str, Any]]:
         """Split one Markdown doc into chunks with max_tokens limit."""
@@ -84,7 +94,11 @@ class DoclingChunker:
         metadata = metadata or {}

         def _token_len(txt: str) -> int:
-            return len(self.tokenizer.tokenize(txt))
+            if self.tokenizer is not None:
+                return len(self.tokenizer.tokenize(txt))
+            else:
+                # Fallback: approximate 4 characters per token
+                return max(1, len(txt) // 4)

         chunks: List[Dict[str, Any]] = []
         global_idx = 0

View File

@@ -39,12 +39,14 @@ class IndexingPipeline:
                 print(f"⚠️ Failed to initialise DoclingChunker: {e}. Falling back to legacy chunker.")
                 self.chunker = MarkdownRecursiveChunker(
                     max_chunk_size=chunk_size,
-                    min_chunk_size=min(chunk_overlap, chunk_size // 4) # Sensible minimum
+                    min_chunk_size=min(chunk_overlap, chunk_size // 4), # Sensible minimum
+                    tokenizer_model=config.get("embedding_model_name", "Qwen/Qwen3-Embedding-0.6B")
                 )
         else:
             self.chunker = MarkdownRecursiveChunker(
                 max_chunk_size=chunk_size,
-                min_chunk_size=min(chunk_overlap, chunk_size // 4) # Sensible minimum
+                min_chunk_size=min(chunk_overlap, chunk_size // 4), # Sensible minimum
+                tokenizer_model=config.get("embedding_model_name", "Qwen/Qwen3-Embedding-0.6B")
             )

         retriever_configs = self.config.get("retrievers") or self.config.get("retrieval", {})
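Hypothetical wiring for the change above (the config keys and sizes are illustrative; only embedding_model_name is read by the code in this hunk, and MarkdownRecursiveChunker is assumed importable from the project's chunking module):

config = {"embedding_model_name": "Qwen/Qwen3-Embedding-0.6B"}
chunk_size, chunk_overlap = 512, 64

chunker = MarkdownRecursiveChunker(
    max_chunk_size=chunk_size,                           # interpreted as tokens after this commit
    min_chunk_size=min(chunk_overlap, chunk_size // 4),
    tokenizer_model=config.get("embedding_model_name", "Qwen/Qwen3-Embedding-0.6B"),
)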

View File

@@ -118,13 +118,13 @@ export function IndexForm({ onClose, onIndexed }: Props) {
             <GlassToggle checked={enableLateChunk} onChange={setEnableLateChunk} />
           </div>
           <div className="flex items-center gap-2">
-            <span className="text-xs text-gray-400">High-recall chunking <InfoTooltip text="Sentence-level packing (Docling) for maximum recall at indexing time. Uses token-based chunking (recommended)." size={12} /></span>
+            <span className="text-xs text-gray-400">High-recall chunking <InfoTooltip text="Advanced sentence-level packing with Docling features for maximum recall. Both modes use token-based sizing." size={12} /></span>
             <GlassToggle checked={enableDoclingChunk} onChange={setEnableDoclingChunk} />
           </div>
         </div>
         <div className="grid grid-cols-2 gap-4 mt-4">
           <div>
-            <label className="flex items-center gap-1 text-xs mb-1 text-gray-400">Chunk size <InfoTooltip text="Maximum token length for each chunk when using high-recall chunking, or character length for legacy chunking." size={12} /></label>
+            <label className="flex items-center gap-1 text-xs mb-1 text-gray-400">Chunk size <InfoTooltip text="Maximum token length for each chunk. Both legacy and high-recall modes now use token-based sizing." size={12} /></label>
             <GlassInput type="number" value={chunkSize} onChange={(e) => setChunkSize(parseInt(e.target.value))} />
           </div>
           <div>