Skip to content

Commit 37c251e

Browse files
committed
Embeddings
1 parent 5d6d548 commit 37c251e

File tree

6 files changed

+160
-99
lines changed

6 files changed

+160
-99
lines changed

tldw_chatbook/Embeddings/Chroma_Lib.py

Lines changed: 22 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -527,7 +527,16 @@ def process_and_store_content(self, # Uses embedding_factory
527527

528528
logger.debug(f"process_and_store_content: Preparing chunks for embedding, contextualized={create_contextualized}")
529529
for i, chunk in enumerate(chunks):
530-
chunk_text = chunk['text']
530+
# Handle both 'text_for_embedding' (from chunk_for_embedding) and 'text' (from other chunking methods)
531+
if 'text_for_embedding' in chunk:
532+
chunk_text = chunk['text_for_embedding']
533+
logger.debug(f"process_and_store_content: Using 'text_for_embedding' field for chunk {i+1}")
534+
elif 'text' in chunk:
535+
chunk_text = chunk['text']
536+
logger.debug(f"process_and_store_content: Using 'text' field for chunk {i+1}")
537+
else:
538+
logger.error(f"process_and_store_content: Chunk {i+1} missing both 'text' and 'text_for_embedding' fields: {list(chunk.keys())}")
539+
raise ValueError(f"Chunk {i+1} missing text content field")
531540
docs_for_chroma.append(chunk_text)
532541

533542
if create_contextualized:
@@ -559,11 +568,21 @@ def process_and_store_content(self, # Uses embedding_factory
559568
ids = [f"{media_id}_chunk_{i}" for i in range(len(chunks))]
560569
metadatas = []
561570
for i, chunk_info in enumerate(chunks):
571+
# Handle both 'original_chunk_text' (from chunk_for_embedding) and 'text' (from other chunking methods)
572+
if 'original_chunk_text' in chunk_info:
573+
original_text = chunk_info['original_chunk_text']
574+
logger.debug(f"process_and_store_content: Using 'original_chunk_text' field for metadata in chunk {i+1}")
575+
elif 'text' in chunk_info:
576+
original_text = chunk_info['text']
577+
logger.debug(f"process_and_store_content: Using 'text' field for metadata in chunk {i+1}")
578+
else:
579+
logger.error(f"process_and_store_content: Chunk {i+1} missing both 'text' and 'original_chunk_text' fields for metadata: {list(chunk_info.keys())}")
580+
original_text = "Text content unavailable"
581+
562582
meta = {
563583
"media_id": str(media_id), "chunk_index": i, "total_chunks": len(chunks),
564584
"file_name": str(file_name), "contextualized": create_contextualized,
565-
"original_chunk_text_ref": chunk_info['text'][:200] + "..." if len(
566-
chunk_info['text']) > 200 else chunk_info['text']
585+
"original_chunk_text_ref": original_text[:200] + "..." if len(original_text) > 200 else original_text
567586
}
568587
meta.update(chunk_info.get('metadata', {}))
569588
if create_contextualized:

0 commit comments

Comments
 (0)