mirror of
https://github.com/zebrajr/localGPT.git
synced 2025-12-06 00:20:19 +01:00
Merge pull request #871 from PromtEngineer/fix/lancedb-nan-handling
Fix: Add comprehensive NaN handling for LanceDB indexing
This commit is contained in:
commit
a4e5087aef
|
|
@ -48,7 +48,21 @@ class VectorIndexer:
|
|||
])
|
||||
|
||||
data = []
|
||||
skipped_count = 0
|
||||
|
||||
for chunk, vector in zip(chunks, embeddings):
|
||||
# Check for NaN values in the vector
|
||||
if np.isnan(vector).any():
|
||||
print(f"⚠️ Skipping chunk '{chunk.get('chunk_id', 'unknown')}' due to NaN values in embedding")
|
||||
skipped_count += 1
|
||||
continue
|
||||
|
||||
# Check for infinite values in the vector
|
||||
if np.isinf(vector).any():
|
||||
print(f"⚠️ Skipping chunk '{chunk.get('chunk_id', 'unknown')}' due to infinite values in embedding")
|
||||
skipped_count += 1
|
||||
continue
|
||||
|
||||
# Ensure original_text is in metadata if not already present
|
||||
if 'original_text' not in chunk['metadata']:
|
||||
chunk['metadata']['original_text'] = chunk['text']
|
||||
|
|
@ -71,6 +85,13 @@ class VectorIndexer:
|
|||
"metadata": json.dumps(chunk)
|
||||
})
|
||||
|
||||
if skipped_count > 0:
|
||||
print(f"⚠️ Skipped {skipped_count} chunks due to invalid embeddings (NaN or infinite values)")
|
||||
|
||||
if not data:
|
||||
print("❌ No valid embeddings to index after filtering out NaN/infinite values")
|
||||
return
|
||||
|
||||
# Incremental indexing: append to existing table if present, otherwise create it
|
||||
db = self.db_manager.db # underlying LanceDB connection
|
||||
|
||||
|
|
@ -81,8 +102,20 @@ class VectorIndexer:
|
|||
print(f"Creating table '{table_name}' (new) and adding {len(data)} vectors...")
|
||||
tbl = self.db_manager.create_table(table_name, schema=schema, mode="create")
|
||||
|
||||
tbl.add(data)
|
||||
print(f"Indexed {len(data)} vectors into table '{table_name}'.")
|
||||
# Add data with NaN handling configuration
|
||||
try:
|
||||
tbl.add(data, on_bad_vectors='drop')
|
||||
print(f"✅ Indexed {len(data)} vectors into table '{table_name}'.")
|
||||
except Exception as e:
|
||||
print(f"❌ Failed to add data to table: {e}")
|
||||
# Fallback: try with fill strategy
|
||||
try:
|
||||
print("🔄 Retrying with NaN fill strategy...")
|
||||
tbl.add(data, on_bad_vectors='fill', fill_value=0.0)
|
||||
print(f"✅ Indexed {len(data)} vectors into table '{table_name}' (with NaN fill).")
|
||||
except Exception as e2:
|
||||
print(f"❌ Failed to add data even with NaN fill: {e2}")
|
||||
raise
|
||||
|
||||
# BM25Indexer is no longer needed as we are moving to LanceDB's native FTS.
|
||||
# class BM25Indexer:
|
||||
|
|
|
|||
|
|
@ -76,5 +76,13 @@ class LateChunkEncoder:
|
|||
# Fallback: if the tokenizer lost the span (e.g. due to trimming), use only the first token's hidden state (index 0)
|
||||
token_indices = [0]
|
||||
chunk_vec = last_hidden[token_indices].mean(dim=0).numpy().astype("float32")
|
||||
|
||||
# Check for NaN or infinite values
|
||||
if np.isnan(chunk_vec).any() or np.isinf(chunk_vec).any():
|
||||
print(f"⚠️ Warning: Invalid values detected in late chunk embedding for span ({start_char}, {end_char})")
|
||||
# Replace invalid values with zeros
|
||||
chunk_vec = np.nan_to_num(chunk_vec, nan=0.0, posinf=0.0, neginf=0.0)
|
||||
print(f"🔄 Replaced invalid values with zeros")
|
||||
|
||||
vectors.append(chunk_vec)
|
||||
return vectors
|
||||
|
|
@ -52,7 +52,24 @@ class QwenEmbedder(EmbeddingModel):
|
|||
seq_len = inputs["attention_mask"].sum(dim=1) - 1 # index of last token
|
||||
batch_indices = torch.arange(last_hidden.size(0), device=self.device)
|
||||
embeddings = last_hidden[batch_indices, seq_len]
|
||||
return embeddings.cpu().numpy()
|
||||
|
||||
# Convert to numpy and validate
|
||||
embeddings_np = embeddings.cpu().numpy()
|
||||
|
||||
# Check for NaN or infinite values
|
||||
if np.isnan(embeddings_np).any():
|
||||
print(f"⚠️ Warning: NaN values detected in embeddings from {self.model_name}")
|
||||
# Replace NaN values with zeros
|
||||
embeddings_np = np.nan_to_num(embeddings_np, nan=0.0, posinf=0.0, neginf=0.0)
|
||||
print(f"🔄 Replaced NaN values with zeros")
|
||||
|
||||
if np.isinf(embeddings_np).any():
|
||||
print(f"⚠️ Warning: Infinite values detected in embeddings from {self.model_name}")
|
||||
# Replace infinite values with zeros
|
||||
embeddings_np = np.nan_to_num(embeddings_np, nan=0.0, posinf=0.0, neginf=0.0)
|
||||
print(f"🔄 Replaced infinite values with zeros")
|
||||
|
||||
return embeddings_np
|
||||
|
||||
class EmbeddingGenerator:
|
||||
def __init__(self, embedding_model: EmbeddingModel, batch_size: int = 50):
|
||||
|
|
@ -108,7 +125,22 @@ class OllamaEmbedder(EmbeddingModel):
|
|||
def create_embeddings(self, texts: List[str]):
    """Embed each text and return the stacked, sanitised embedding matrix.

    Every entry of ``texts`` is embedded with ``self._embed_single`` and the
    results are stacked row-wise with ``np.vstack``. Any NaN or infinite
    entries are then replaced with ``0.0``.

    Args:
        texts: The strings to embed, one output row per string.

    Returns:
        A NumPy array with one row per input text and no NaN/inf entries.

    Raises:
        ValueError: Propagated from ``np.vstack`` when ``texts`` is empty.
    """
    import numpy as np

    # assumes each _embed_single result is a 1-D vector of equal length — TODO confirm
    embeddings_np = np.vstack([self._embed_single(text) for text in texts])

    # Inspect the raw matrix for both conditions *before* sanitising:
    # nan_to_num zeroes NaN and +/-inf in one pass, so checking for inf after
    # a NaN clean-up would never report infinities that were present.
    has_nan = bool(np.isnan(embeddings_np).any())
    has_inf = bool(np.isinf(embeddings_np).any())

    if has_nan:
        print(f"⚠️ Warning: NaN values detected in Ollama embeddings from {self.model_name}")
    if has_inf:
        print(f"⚠️ Warning: Infinite values detected in Ollama embeddings from {self.model_name}")

    if has_nan or has_inf:
        # Single sanitising pass covers NaN, +inf and -inf.
        embeddings_np = np.nan_to_num(embeddings_np, nan=0.0, posinf=0.0, neginf=0.0)
        print("🔄 Replaced invalid values with zeros")

    return embeddings_np
|
||||
|
||||
def select_embedder(model_name: str, ollama_host: str | None = None):
|
||||
"""Return appropriate EmbeddingModel implementation for the given name."""
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user