@@ -527,7 +527,16 @@ def process_and_store_content(self, # Uses embedding_factory
527
527
528
528
logger .debug (f"process_and_store_content: Preparing chunks for embedding, contextualized={ create_contextualized } " )
529
529
for i , chunk in enumerate (chunks ):
530
- chunk_text = chunk ['text' ]
530
+ # Handle both 'text_for_embedding' (from chunk_for_embedding) and 'text' (from other chunking methods)
531
+ if 'text_for_embedding' in chunk :
532
+ chunk_text = chunk ['text_for_embedding' ]
533
+ logger .debug (f"process_and_store_content: Using 'text_for_embedding' field for chunk { i + 1 } " )
534
+ elif 'text' in chunk :
535
+ chunk_text = chunk ['text' ]
536
+ logger .debug (f"process_and_store_content: Using 'text' field for chunk { i + 1 } " )
537
+ else :
538
+ logger .error (f"process_and_store_content: Chunk { i + 1 } missing both 'text' and 'text_for_embedding' fields: { list (chunk .keys ())} " )
539
+ raise ValueError (f"Chunk { i + 1 } missing text content field" )
531
540
docs_for_chroma .append (chunk_text )
532
541
533
542
if create_contextualized :
@@ -559,11 +568,21 @@ def process_and_store_content(self, # Uses embedding_factory
559
568
ids = [f"{ media_id } _chunk_{ i } " for i in range (len (chunks ))]
560
569
metadatas = []
561
570
for i , chunk_info in enumerate (chunks ):
571
+ # Handle both 'original_chunk_text' (from chunk_for_embedding) and 'text' (from other chunking methods)
572
+ if 'original_chunk_text' in chunk_info :
573
+ original_text = chunk_info ['original_chunk_text' ]
574
+ logger .debug (f"process_and_store_content: Using 'original_chunk_text' field for metadata in chunk { i + 1 } " )
575
+ elif 'text' in chunk_info :
576
+ original_text = chunk_info ['text' ]
577
+ logger .debug (f"process_and_store_content: Using 'text' field for metadata in chunk { i + 1 } " )
578
+ else :
579
+ logger .error (f"process_and_store_content: Chunk { i + 1 } missing both 'text' and 'original_chunk_text' fields for metadata: { list (chunk_info .keys ())} " )
580
+ original_text = "Text content unavailable"
581
+
562
582
meta = {
563
583
"media_id" : str (media_id ), "chunk_index" : i , "total_chunks" : len (chunks ),
564
584
"file_name" : str (file_name ), "contextualized" : create_contextualized ,
565
- "original_chunk_text_ref" : chunk_info ['text' ][:200 ] + "..." if len (
566
- chunk_info ['text' ]) > 200 else chunk_info ['text' ]
585
+ "original_chunk_text_ref" : original_text [:200 ] + "..." if len (original_text ) > 200 else original_text
567
586
}
568
587
meta .update (chunk_info .get ('metadata' , {}))
569
588
if create_contextualized :
0 commit comments