fix: make both chunking methods token-based
- Update MarkdownRecursiveChunker to use tokenizer for token-based sizing
- Update DoclingChunker to use tokenizer with proper error handling
- Ensure IndexingPipeline passes tokenizer_model to both chunkers
- Update UI tooltips to reflect that both modes now use tokens
- Keep Docling as default for enhanced granularity features
- Add fallback to character-based approximation when tokenizer fails

Co-Authored-By: PromptEngineer <jnfarooq@outlook.com>
This commit is contained in:
parent 3b648520c9
commit a13a71d247
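The practical effect of this change is that chunk budgets are interpreted in tokens of the embedding model rather than in characters. A rough sketch of the difference (not part of the diff; assumes the transformers package is installed and the Qwen/Qwen3-Embedding-0.6B tokenizer named in the commit can be downloaded):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen3-Embedding-0.6B", trust_remote_code=True)
text = "Retrieval-augmented generation splits documents into chunks before embedding."
print(len(text))                 # character length, the old sizing basis
print(len(tok.tokenize(text)))   # token length, the new sizing basis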
@@ -1,5 +1,6 @@
 from typing import List, Dict, Any, Optional
 import re
+from transformers import AutoTokenizer
 
 class MarkdownRecursiveChunker:
     """
@@ -7,11 +8,31 @@ class MarkdownRecursiveChunker:
     and embeds document-level metadata into each chunk.
     """
 
-    def __init__(self, max_chunk_size: int = 1500, min_chunk_size: int = 200):
+    def __init__(self, max_chunk_size: int = 1500, min_chunk_size: int = 200, tokenizer_model: str = "Qwen/Qwen3-Embedding-0.6B"):
         self.max_chunk_size = max_chunk_size
         self.min_chunk_size = min_chunk_size
         self.split_priority = ["\n## ", "\n### ", "\n#### ", "```", "\n\n"]
+
+        repo_id = tokenizer_model
+        if "/" not in tokenizer_model and not tokenizer_model.startswith("Qwen/"):
+            repo_id = {
+                "qwen3-embedding-0.6b": "Qwen/Qwen3-Embedding-0.6B",
+            }.get(tokenizer_model.lower(), tokenizer_model)
+
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
+        except Exception as e:
+            print(f"Warning: Failed to load tokenizer {repo_id}: {e}")
+            print("Falling back to character-based approximation (4 chars ≈ 1 token)")
+            self.tokenizer = None
+
+    def _token_len(self, text: str) -> int:
+        """Get token count for text using the tokenizer."""
+        if self.tokenizer is not None:
+            return len(self.tokenizer.tokenize(text))
+        else:
+            return max(1, len(text) // 4)
 
     def _split_text(self, text: str, separators: List[str]) -> List[str]:
         final_chunks = []
         chunks_to_process = [text]
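A usage sketch (not part of the commit) of the new constructor: a bare model alias is mapped to its full Hugging Face repo id, and if tokenizer loading fails, self.tokenizer stays None so _token_len falls back to the 4-characters-per-token estimate. The specific sizes below are illustrative:

chunker = MarkdownRecursiveChunker(
    max_chunk_size=512,                      # now a token budget, not characters
    min_chunk_size=64,
    tokenizer_model="qwen3-embedding-0.6b",  # alias resolved to Qwen/Qwen3-Embedding-0.6B
)
print(chunker._token_len("hello world"))     # tokenizer count, or len(text) // 4 if loading failed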
@@ -19,7 +40,7 @@ class MarkdownRecursiveChunker:
         for sep in separators:
             new_chunks = []
             for chunk in chunks_to_process:
-                if len(chunk) > self.max_chunk_size:
+                if self._token_len(chunk) > self.max_chunk_size:
                     sub_chunks = re.split(f'({sep})', chunk)
                     combined = []
                     i = 0
@@ -38,8 +59,19 @@ class MarkdownRecursiveChunker:
 
         final_chunks = []
         for chunk in chunks_to_process:
-            if len(chunk) > self.max_chunk_size:
-                final_chunks.extend([chunk[i:i+self.max_chunk_size] for i in range(0, len(chunk), self.max_chunk_size)])
+            if self._token_len(chunk) > self.max_chunk_size:
+                words = chunk.split()
+                current_chunk = ""
+                for word in words:
+                    test_chunk = current_chunk + " " + word if current_chunk else word
+                    if self._token_len(test_chunk) <= self.max_chunk_size:
+                        current_chunk = test_chunk
+                    else:
+                        if current_chunk:
+                            final_chunks.append(current_chunk)
+                        current_chunk = word
+                if current_chunk:
+                    final_chunks.append(current_chunk)
             else:
                 final_chunks.append(chunk)
 
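The loop above greedily packs whitespace-separated words while the running token count stays within max_chunk_size; note that a single word longer than the budget is carried over and emitted as one piece rather than being split. A standalone sketch of the same greedy packing (illustrative only, using the 4-characters-per-token fallback as the counter):

def pack_words(text, max_tokens, token_len=lambda s: max(1, len(s) // 4)):
    # Greedy word packing mirroring the loop added in this hunk.
    pieces, current = [], ""
    for word in text.split():
        candidate = f"{current} {word}" if current else word
        if token_len(candidate) <= max_tokens:
            current = candidate
        else:
            if current:
                pieces.append(current)
            current = word  # an oversized single word is kept whole
    if current:
        pieces.append(current)
    return pieces

print(pack_words("alpha beta gamma " * 40, max_tokens=10))  # several pieces, each within ~10 tokens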
@@ -65,10 +97,11 @@ class MarkdownRecursiveChunker:
         merged_chunks_text = []
         current_chunk = ""
         for chunk_text in raw_chunks:
-            if not current_chunk or len(current_chunk) + len(chunk_text) <= self.max_chunk_size:
-                current_chunk += chunk_text
-            elif len(current_chunk) < self.min_chunk_size:
-                current_chunk += chunk_text
+            test_chunk = current_chunk + chunk_text if current_chunk else chunk_text
+            if not current_chunk or self._token_len(test_chunk) <= self.max_chunk_size:
+                current_chunk = test_chunk
+            elif self._token_len(current_chunk) < self.min_chunk_size:
+                current_chunk = test_chunk
             else:
                 merged_chunks_text.append(current_chunk)
                 current_chunk = chunk_text
@@ -118,4 +151,4 @@ if __name__ == '__main__':
         print(f"Chunk ID: {chunk['chunk_id']}")
         print(f"Text: '{chunk['text']}'")
         print(f"Metadata: {chunk['metadata']}")
-        print("-" * 20)
+        print("-" * 20)
@@ -25,14 +25,24 @@ class DoclingChunker:
         repo_id = {
             "qwen3-embedding-0.6b": "Qwen/Qwen3-Embedding-0.6B",
         }.get(tokenizer_model.lower(), tokenizer_model)
-        self.tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
+
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
+        except Exception as e:
+            print(f"Warning: Failed to load tokenizer {repo_id}: {e}")
+            print("Falling back to character-based approximation (4 chars ≈ 1 token)")
+            self.tokenizer = None
         # Fallback simple sentence splitter (period, question, exclamation, newline)
         self._sent_re = re.compile(r"(?<=[\.\!\?])\s+|\n+")
         self.legacy = MarkdownRecursiveChunker(max_chunk_size=10_000, min_chunk_size=100)
 
     # ------------------------------------------------------------------
     def _token_len(self, text: str) -> int:
-        return len(self.tokenizer.tokenize(text))
+        if self.tokenizer is not None:
+            return len(self.tokenizer.tokenize(text))
+        else:
+            # Fallback: approximate 4 characters per token
+            return max(1, len(text) // 4)
 
     def split_markdown(self, markdown: str, *, document_id: str, metadata: Dict[str, Any]) -> List[Dict[str, Any]]:
         """Split one Markdown doc into chunks with max_tokens limit."""
@@ -84,7 +94,11 @@ class DoclingChunker:
         metadata = metadata or {}
 
         def _token_len(txt: str) -> int:
-            return len(self.tokenizer.tokenize(txt))
+            if self.tokenizer is not None:
+                return len(self.tokenizer.tokenize(txt))
+            else:
+                # Fallback: approximate 4 characters per token
+                return max(1, len(txt) // 4)
 
         chunks: List[Dict[str, Any]] = []
         global_idx = 0
@@ -233,4 +247,4 @@ class DoclingChunker:
 
     # Public API expected by IndexingPipeline --------------------------------
     def chunk(self, text: str, document_id: str, document_metadata: Dict[str, Any] | None = None) -> List[Dict[str, Any]]:
-        return self.split_markdown(text, document_id=document_id, metadata=document_metadata or {})
+        return self.split_markdown(text, document_id=document_id, metadata=document_metadata or {})
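Both chunkers expose the chunk() entry point expected by IndexingPipeline. A hypothetical call (construction with defaults, the document id, and the metadata are all made up; the chunk_id/text fields mirror the MarkdownRecursiveChunker demo earlier in this diff, and DoclingChunker is assumed to emit the same shape):

chunker = DoclingChunker()  # assumes default construction works in this environment
markdown_text = "# Title\n\nSome body text to be chunked."
chunks = chunker.chunk(markdown_text, document_id="doc-001", document_metadata={"source": "example.md"})
for c in chunks:
    print(c["chunk_id"], len(c["text"]))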
@@ -39,12 +39,14 @@ class IndexingPipeline:
                 print(f"⚠️ Failed to initialise DoclingChunker: {e}. Falling back to legacy chunker.")
                 self.chunker = MarkdownRecursiveChunker(
                     max_chunk_size=chunk_size,
-                    min_chunk_size=min(chunk_overlap, chunk_size // 4) # Sensible minimum
+                    min_chunk_size=min(chunk_overlap, chunk_size // 4), # Sensible minimum
+                    tokenizer_model=config.get("embedding_model_name", "Qwen/Qwen3-Embedding-0.6B")
                 )
         else:
             self.chunker = MarkdownRecursiveChunker(
                 max_chunk_size=chunk_size,
-                min_chunk_size=min(chunk_overlap, chunk_size // 4) # Sensible minimum
+                min_chunk_size=min(chunk_overlap, chunk_size // 4), # Sensible minimum
+                tokenizer_model=config.get("embedding_model_name", "Qwen/Qwen3-Embedding-0.6B")
             )
 
         retriever_configs = self.config.get("retrievers") or self.config.get("retrieval", {})
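Per the lines above, both chunkers now measure text with whatever tokenizer matches the configured embedding model, so the chunk size budget stays aligned with the embedder. A hypothetical config fragment (only embedding_model_name is taken from the diff; the other keys are illustrative):

config = {
    "embedding_model_name": "Qwen/Qwen3-Embedding-0.6B",  # reused as tokenizer_model for chunk sizing
    # chunk size / overlap / retriever settings omitted; their exact key names are not shown in this diff
}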
@@ -118,13 +118,13 @@ export function IndexForm({ onClose, onIndexed }: Props) {
           <GlassToggle checked={enableLateChunk} onChange={setEnableLateChunk} />
         </div>
         <div className="flex items-center gap-2">
-          <span className="text-xs text-gray-400">High-recall chunking <InfoTooltip text="Sentence-level packing (Docling) for maximum recall at indexing time. Uses token-based chunking (recommended)." size={12} /></span>
+          <span className="text-xs text-gray-400">High-recall chunking <InfoTooltip text="Advanced sentence-level packing with Docling features for maximum recall. Both modes use token-based sizing." size={12} /></span>
           <GlassToggle checked={enableDoclingChunk} onChange={setEnableDoclingChunk} />
         </div>
       </div>
       <div className="grid grid-cols-2 gap-4 mt-4">
         <div>
-          <label className="flex items-center gap-1 text-xs mb-1 text-gray-400">Chunk size <InfoTooltip text="Maximum token length for each chunk when using high-recall chunking, or character length for legacy chunking." size={12} /></label>
+          <label className="flex items-center gap-1 text-xs mb-1 text-gray-400">Chunk size <InfoTooltip text="Maximum token length for each chunk. Both legacy and high-recall modes now use token-based sizing." size={12} /></label>
           <GlassInput type="number" value={chunkSize} onChange={(e) => setChunkSize(parseInt(e.target.value))} />
         </div>
         <div>
@@ -220,4 +220,4 @@ export function IndexForm({ onClose, onIndexed }: Props) {
       </div>
     </div>
   );
-}
+}