Merge pull request #871 from PromtEngineer/fix/lancedb-nan-handling

Fix: Add comprehensive NaN handling for LanceDB indexing
PromptEngineer 2025-07-18 00:56:47 -07:00 committed by GitHub
commit a4e5087aef
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 77 additions and 4 deletions

View File

@@ -48,7 +48,21 @@ class VectorIndexer:
         ])
         data = []
         skipped_count = 0
         for chunk, vector in zip(chunks, embeddings):
+            # Check for NaN values in the vector
+            if np.isnan(vector).any():
+                print(f"⚠️ Skipping chunk '{chunk.get('chunk_id', 'unknown')}' due to NaN values in embedding")
+                skipped_count += 1
+                continue
+
+            # Check for infinite values in the vector
+            if np.isinf(vector).any():
+                print(f"⚠️ Skipping chunk '{chunk.get('chunk_id', 'unknown')}' due to infinite values in embedding")
+                skipped_count += 1
+                continue
+
             # Ensure original_text is in metadata if not already present
             if 'original_text' not in chunk['metadata']:
                 chunk['metadata']['original_text'] = chunk['text']
@@ -71,6 +85,13 @@ class VectorIndexer:
                 "metadata": json.dumps(chunk)
             })

+        if skipped_count > 0:
+            print(f"⚠️ Skipped {skipped_count} chunks due to invalid embeddings (NaN or infinite values)")
+
+        if not data:
+            print("❌ No valid embeddings to index after filtering out NaN/infinite values")
+            return
+
         # Incremental indexing: append to existing table if present, otherwise create it
         db = self.db_manager.db  # underlying LanceDB connection
@@ -81,8 +102,20 @@
             print(f"Creating table '{table_name}' (new) and adding {len(data)} vectors...")
             tbl = self.db_manager.create_table(table_name, schema=schema, mode="create")
-        tbl.add(data)
-        print(f"Indexed {len(data)} vectors into table '{table_name}'.")
+
+        # Add data with NaN handling configuration
+        try:
+            tbl.add(data, on_bad_vectors='drop')
+            print(f"✅ Indexed {len(data)} vectors into table '{table_name}'.")
+        except Exception as e:
+            print(f"❌ Failed to add data to table: {e}")
+            # Fallback: try with fill strategy
+            try:
+                print("🔄 Retrying with NaN fill strategy...")
+                tbl.add(data, on_bad_vectors='fill', fill_value=0.0)
+                print(f"✅ Indexed {len(data)} vectors into table '{table_name}' (with NaN fill).")
+            except Exception as e2:
+                print(f"❌ Failed to add data even with NaN fill: {e2}")
+                raise

 # BM25Indexer is no longer needed as we are moving to LanceDB's native FTS.
 # class BM25Indexer:
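The try/except above leans on LanceDB's own bad-vector handling rather than raising outright. A minimal standalone sketch of that option, assuming a local lancedb install (the path, table name, and rows below are illustrative and not taken from this commit):

import lancedb

db = lancedb.connect("./lancedb_nan_demo")  # illustrative local path
rows = [
    {"id": "ok", "vector": [0.1, 0.2]},
    {"id": "bad", "vector": [float("nan"), 0.3]},
]
# on_bad_vectors='drop' discards rows whose vector contains NaN instead of
# erroring; on_bad_vectors='fill' keeps them and writes fill_value instead.
tbl = db.create_table("demo", data=rows, mode="overwrite", on_bad_vectors="drop")
print(tbl.count_rows())  # expected: 1 (the NaN row was dropped)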

View File

@@ -76,5 +76,13 @@ class LateChunkEncoder:
                 # Fallback: if tokenizer lost the span (e.g. due to trimming) just average CLS + SEP
                 token_indices = [0]
             chunk_vec = last_hidden[token_indices].mean(dim=0).numpy().astype("float32")
+
+            # Check for NaN or infinite values
+            if np.isnan(chunk_vec).any() or np.isinf(chunk_vec).any():
+                print(f"⚠️ Warning: Invalid values detected in late chunk embedding for span ({start_char}, {end_char})")
+                # Replace invalid values with zeros
+                chunk_vec = np.nan_to_num(chunk_vec, nan=0.0, posinf=0.0, neginf=0.0)
+                print(f"🔄 Replaced invalid values with zeros")
+
             vectors.append(chunk_vec)
         return vectors
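The replacement here leans entirely on np.nan_to_num. A short illustration of its behaviour (the values are made up for the example):

import numpy as np

vec = np.array([0.1, float("nan"), float("inf"), float("-inf")], dtype="float32")
clean = np.nan_to_num(vec, nan=0.0, posinf=0.0, neginf=0.0)
print(clean)  # NaN and +/-inf become 0.0; finite values are untouched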

View File

@@ -52,7 +52,24 @@ class QwenEmbedder(EmbeddingModel):
         seq_len = inputs["attention_mask"].sum(dim=1) - 1  # index of last token
         batch_indices = torch.arange(last_hidden.size(0), device=self.device)
         embeddings = last_hidden[batch_indices, seq_len]
-        return embeddings.cpu().numpy()
+
+        # Convert to numpy and validate
+        embeddings_np = embeddings.cpu().numpy()
+
+        # Check for NaN or infinite values
+        if np.isnan(embeddings_np).any():
+            print(f"⚠️ Warning: NaN values detected in embeddings from {self.model_name}")
+            # Replace NaN values with zeros
+            embeddings_np = np.nan_to_num(embeddings_np, nan=0.0, posinf=0.0, neginf=0.0)
+            print(f"🔄 Replaced NaN values with zeros")
+
+        if np.isinf(embeddings_np).any():
+            print(f"⚠️ Warning: Infinite values detected in embeddings from {self.model_name}")
+            # Replace infinite values with zeros
+            embeddings_np = np.nan_to_num(embeddings_np, nan=0.0, posinf=0.0, neginf=0.0)
+            print(f"🔄 Replaced infinite values with zeros")
+
+        return embeddings_np

 class EmbeddingGenerator:
     def __init__(self, embedding_model: EmbeddingModel, batch_size: int = 50):
@@ -108,7 +125,22 @@ class OllamaEmbedder(EmbeddingModel):
     def create_embeddings(self, texts: List[str]):
         import numpy as np
         vectors = [self._embed_single(t) for t in texts]
-        return np.vstack(vectors)
+        embeddings_np = np.vstack(vectors)
+
+        # Check for NaN or infinite values
+        if np.isnan(embeddings_np).any():
+            print(f"⚠️ Warning: NaN values detected in Ollama embeddings from {self.model_name}")
+            # Replace NaN values with zeros
+            embeddings_np = np.nan_to_num(embeddings_np, nan=0.0, posinf=0.0, neginf=0.0)
+            print(f"🔄 Replaced NaN values with zeros")
+
+        if np.isinf(embeddings_np).any():
+            print(f"⚠️ Warning: Infinite values detected in Ollama embeddings from {self.model_name}")
+            # Replace infinite values with zeros
+            embeddings_np = np.nan_to_num(embeddings_np, nan=0.0, posinf=0.0, neginf=0.0)
+            print(f"🔄 Replaced infinite values with zeros")
+
+        return embeddings_np

 def select_embedder(model_name: str, ollama_host: str | None = None):
     """Return appropriate EmbeddingModel implementation for the given name."""