From acf6efb5a4286cb79ffc34c832dcbaaad6cea66a Mon Sep 17 00:00:00 2001 From: PromptEngineer <134474669+PromtEngineer@users.noreply.github.com> Date: Fri, 18 Jul 2025 00:26:39 -0700 Subject: [PATCH] fix: Add comprehensive NaN handling for LanceDB indexing - Add NaN and infinite value detection in QwenEmbedder and OllamaEmbedder - Add rows to the LanceDB table via tbl.add() with the on_bad_vectors='drop' parameter - Add fallback strategy with on_bad_vectors='fill' and fill_value=0.0 - Add pre-filtering of chunks with invalid embeddings before indexing - Add NaN validation to LateChunkEncoder - Add detailed logging for skipped chunks and error handling - Resolves LanceDB error: 'Vector column has NaNs' during indexing This fix ensures robust handling of edge cases in embedding generation and prevents indexing failures due to invalid vector values. --- rag_system/indexing/embedders.py | 37 ++++++++++++++++++++++++-- rag_system/indexing/latechunk.py | 8 ++++++ rag_system/indexing/representations.py | 36 +++++++++++++++++++++++-- 3 files changed, 77 insertions(+), 4 deletions(-) diff --git a/rag_system/indexing/embedders.py b/rag_system/indexing/embedders.py index 81c2924..b48648f 100644 --- a/rag_system/indexing/embedders.py +++ b/rag_system/indexing/embedders.py @@ -48,7 +48,21 @@ class VectorIndexer: ]) data = [] + skipped_count = 0 + for chunk, vector in zip(chunks, embeddings): + # Check for NaN values in the vector + if np.isnan(vector).any(): + print(f"⚠️ Skipping chunk '{chunk.get('chunk_id', 'unknown')}' due to NaN values in embedding") + skipped_count += 1 + continue + + # Check for infinite values in the vector + if np.isinf(vector).any(): + print(f"⚠️ Skipping chunk '{chunk.get('chunk_id', 'unknown')}' due to infinite values in embedding") + skipped_count += 1 + continue + # Ensure original_text is in metadata if not already present if 'original_text' not in chunk['metadata']: chunk['metadata']['original_text'] = chunk['text'] @@ -71,6 +85,13 @@ class VectorIndexer: 
"metadata": json.dumps(chunk) }) + if skipped_count > 0: + print(f"⚠️ Skipped {skipped_count} chunks due to invalid embeddings (NaN or infinite values)") + + if not data: + print("❌ No valid embeddings to index after filtering out NaN/infinite values") + return + # Incremental indexing: append to existing table if present, otherwise create it db = self.db_manager.db # underlying LanceDB connection @@ -81,8 +102,20 @@ class VectorIndexer: print(f"Creating table '{table_name}' (new) and adding {len(data)} vectors...") tbl = self.db_manager.create_table(table_name, schema=schema, mode="create") - tbl.add(data) - print(f"Indexed {len(data)} vectors into table '{table_name}'.") + # Add data with NaN handling configuration + try: + tbl.add(data, on_bad_vectors='drop') + print(f"✅ Indexed {len(data)} vectors into table '{table_name}'.") + except Exception as e: + print(f"❌ Failed to add data to table: {e}") + # Fallback: try with fill strategy + try: + print("🔄 Retrying with NaN fill strategy...") + tbl.add(data, on_bad_vectors='fill', fill_value=0.0) + print(f"✅ Indexed {len(data)} vectors into table '{table_name}' (with NaN fill).") + except Exception as e2: + print(f"❌ Failed to add data even with NaN fill: {e2}") + raise # BM25Indexer is no longer needed as we are moving to LanceDB's native FTS. # class BM25Indexer: diff --git a/rag_system/indexing/latechunk.py b/rag_system/indexing/latechunk.py index ae12e12..094a524 100644 --- a/rag_system/indexing/latechunk.py +++ b/rag_system/indexing/latechunk.py @@ -76,5 +76,13 @@ class LateChunkEncoder: # Fallback: if tokenizer lost the span (e.g. 
due to trimming) just average CLS + SEP token_indices = [0] chunk_vec = last_hidden[token_indices].mean(dim=0).numpy().astype("float32") + + # Check for NaN or infinite values + if np.isnan(chunk_vec).any() or np.isinf(chunk_vec).any(): + print(f"⚠️ Warning: Invalid values detected in late chunk embedding for span ({start_char}, {end_char})") + # Replace invalid values with zeros + chunk_vec = np.nan_to_num(chunk_vec, nan=0.0, posinf=0.0, neginf=0.0) + print(f"🔄 Replaced invalid values with zeros") + vectors.append(chunk_vec) return vectors \ No newline at end of file diff --git a/rag_system/indexing/representations.py b/rag_system/indexing/representations.py index f257d6c..a3e5ce3 100644 --- a/rag_system/indexing/representations.py +++ b/rag_system/indexing/representations.py @@ -52,7 +52,24 @@ class QwenEmbedder(EmbeddingModel): seq_len = inputs["attention_mask"].sum(dim=1) - 1 # index of last token batch_indices = torch.arange(last_hidden.size(0), device=self.device) embeddings = last_hidden[batch_indices, seq_len] - return embeddings.cpu().numpy() + + # Convert to numpy and validate + embeddings_np = embeddings.cpu().numpy() + + # Check for NaN or infinite values + if np.isnan(embeddings_np).any(): + print(f"⚠️ Warning: NaN values detected in embeddings from {self.model_name}") + # Replace NaN values with zeros + embeddings_np = np.nan_to_num(embeddings_np, nan=0.0, posinf=0.0, neginf=0.0) + print(f"🔄 Replaced NaN values with zeros") + + if np.isinf(embeddings_np).any(): + print(f"⚠️ Warning: Infinite values detected in embeddings from {self.model_name}") + # Replace infinite values with zeros + embeddings_np = np.nan_to_num(embeddings_np, nan=0.0, posinf=0.0, neginf=0.0) + print(f"🔄 Replaced infinite values with zeros") + + return embeddings_np class EmbeddingGenerator: def __init__(self, embedding_model: EmbeddingModel, batch_size: int = 50): @@ -108,7 +125,22 @@ class OllamaEmbedder(EmbeddingModel): def create_embeddings(self, texts: List[str]): import numpy 
as np vectors = [self._embed_single(t) for t in texts] - return np.vstack(vectors) + embeddings_np = np.vstack(vectors) + + # Check for NaN or infinite values + if np.isnan(embeddings_np).any(): + print(f"⚠️ Warning: NaN values detected in Ollama embeddings from {self.model_name}") + # Replace NaN values with zeros + embeddings_np = np.nan_to_num(embeddings_np, nan=0.0, posinf=0.0, neginf=0.0) + print(f"🔄 Replaced NaN values with zeros") + + if np.isinf(embeddings_np).any(): + print(f"⚠️ Warning: Infinite values detected in Ollama embeddings from {self.model_name}") + # Replace infinite values with zeros + embeddings_np = np.nan_to_num(embeddings_np, nan=0.0, posinf=0.0, neginf=0.0) + print(f"🔄 Replaced infinite values with zeros") + + return embeddings_np def select_embedder(model_name: str, ollama_host: str | None = None): """Return appropriate EmbeddingModel implementation for the given name."""