@@ -325,13 +325,14 @@ def chunk_text(self,
 
 
     def _chunk_text_by_words(self, text: str, max_words: int, overlap: int, language: str) -> List[str]:
-        logger.debug(f"Chunking by words: max_words={max_words}, overlap={overlap}, language='{language}'")
+        logger.info(f"Chunking by words: max_words={max_words}, overlap={overlap}, language='{language}'")
         # Language-specific word tokenization
         words: List[str]
         if language.startswith('zh'):  # Chinese
             try:
                 import jieba
                 words = list(jieba.cut(text))
+                logger.debug(f"Using jieba for Chinese word tokenization, found {len(words)} words")
             except ImportError:
                 logger.warning("jieba library not found for Chinese word tokenization. Falling back to space splitting.")
                 words = text.split()
@@ -371,11 +372,13 @@ def _chunk_text_by_words(self, text: str, max_words: int, overlap: int, language
             chunks.append(' '.join(chunk_words))
             logger.debug(f"Created word chunk {len(chunks)} with {len(chunk_words)} words")
 
-        return self._post_process_chunks(chunks)
+        processed_chunks = self._post_process_chunks(chunks)
+        logger.info(f"Word chunking complete: created {len(processed_chunks)} chunks from {len(words)} words")
+        return processed_chunks
 
 
     def _chunk_text_by_sentences(self, text: str, max_sentences: int, overlap: int, language: str) -> List[str]:
-        logger.debug(f"Chunking by sentences: max_sentences={max_sentences}, overlap={overlap}, lang='{language}'")
+        logger.info(f"Chunking by sentences: max_sentences={max_sentences}, overlap={overlap}, lang='{language}'")
         sentences: List[str]
 
         if language.startswith('zh'):
@@ -445,11 +448,15 @@ def _chunk_text_by_sentences(self, text: str, max_sentences: int, overlap: int,
         for i in range(0, len(sentences), step):
             chunk_sentences = sentences[i : i + max_sentences]
             chunks.append(' '.join(chunk_sentences))
-        return self._post_process_chunks(chunks)
+            logger.debug(f"Created sentence chunk {len(chunks)} with {len(chunk_sentences)} sentences")
+
+        processed_chunks = self._post_process_chunks(chunks)
+        logger.info(f"Sentence chunking complete: created {len(processed_chunks)} chunks from {len(sentences)} sentences")
+        return processed_chunks
 
 
     def _chunk_text_by_paragraphs(self, text: str, max_paragraphs: int, overlap: int) -> List[str]:
-        logger.debug(f"Chunking by paragraphs: max_paragraphs={max_paragraphs}, overlap={overlap}")
+        logger.info(f"Chunking by paragraphs: max_paragraphs={max_paragraphs}, overlap={overlap}")
         # Split by one or more empty lines (common paragraph delimiter)
         paragraphs = re.split(r'\n\s*\n+', text)
         paragraphs = [p.strip() for p in paragraphs if p.strip()]  # Remove empty paragraphs
@@ -470,7 +477,11 @@ def _chunk_text_by_paragraphs(self, text: str, max_paragraphs: int, overlap: int
         for i in range(0, len(paragraphs), step):
             chunk_paragraphs = paragraphs[i : i + max_paragraphs]
             chunks.append('\n\n'.join(chunk_paragraphs))  # Join with double newline to preserve paragraph structure
-        return self._post_process_chunks(chunks)  # post_process_chunks strips leading/trailing, which is fine
+            logger.debug(f"Created paragraph chunk {len(chunks)} with {len(chunk_paragraphs)} paragraphs")
+
+        processed_chunks = self._post_process_chunks(chunks)  # post_process_chunks strips leading/trailing, which is fine
+        logger.info(f"Paragraph chunking complete: created {len(processed_chunks)} chunks from {len(paragraphs)} paragraphs")
+        return processed_chunks
 
 
     def _chunk_text_by_tokens(self, text: str, max_tokens: int, overlap: int) -> List[str]:
@@ -479,7 +490,7 @@ def _chunk_text_by_tokens(self, text: str, max_tokens: int, overlap: int) -> Lis
             logger.error("Tokenizer not available for token-based chunking.")
             raise ChunkingError("Tokenizer not loaded, cannot use 'tokens' chunking method.")
 
-        logger.debug(f"Chunking by tokens: max_tokens={max_tokens}, overlap_tokens={overlap} (token overlap)")
+        logger.info(f"Chunking by tokens: max_tokens={max_tokens}, overlap_tokens={overlap} (token overlap)")
         if max_tokens <= 0:
             logger.warning("max_tokens must be positive. Returning single chunk or empty.")
             return [text] if text.strip() else []
@@ -501,7 +512,11 @@ def _chunk_text_by_tokens(self, text: str, max_tokens: int, overlap: int) -> Lis
             chunk_token_ids = tokens[i : i + max_tokens]
             chunk_text = self.tokenizer.decode(chunk_token_ids, skip_special_tokens=True)  # skip_special_tokens might be an option
             chunks.append(chunk_text)
-        return self._post_process_chunks(chunks)
+            logger.debug(f"Created token chunk {len(chunks)} with {len(chunk_token_ids)} tokens")
+
+        processed_chunks = self._post_process_chunks(chunks)
+        logger.info(f"Token chunking complete: created {len(processed_chunks)} chunks from {len(tokens)} tokens")
+        return processed_chunks
 
 
     # --- Adaptive Chunking Methods ---
@@ -1357,8 +1372,9 @@ def improved_chunking_process(text: str,
                               llm_call_function_for_chunker: Optional[Callable] = None,
                               llm_api_config_for_chunker: Optional[Dict[str, Any]] = None
                               ) -> List[Dict[str, Any]]:
-    logger.debug("Improved chunking process started...")
+    logger.info("Improved chunking process started...")
     logger.debug(f"Received chunk_options_dict: {chunk_options_dict}")
+    logger.debug(f"Text length: {len(text)} characters, tokenizer: {tokenizer_name_or_path}")
 
     chunker_instance = Chunker(options=chunk_options_dict,
                                tokenizer_name_or_path=tokenizer_name_or_path,
@@ -1421,8 +1437,10 @@ def improved_chunking_process(text: str,
 
     chunks_with_metadata_list = []
     total_chunks_count = len(raw_chunks)
+    logger.info(f"Processing {total_chunks_count} chunks for metadata enrichment")
    try:
        for i, chunk_item in enumerate(raw_chunks):
+            logger.debug(f"Processing chunk {i + 1}/{total_chunks_count}")
            actual_text_content: str
            is_json_chunk = False
            chunk_specific_metadata = {}  # Initialize
@@ -1464,6 +1482,7 @@ def improved_chunking_process(text: str,
             })
 
         logger.debug(f"Successfully created metadata for all {len(chunks_with_metadata_list)} chunks")
+        logger.info(f"Improved chunking process completed: {len(chunks_with_metadata_list)} chunks created using method '{effective_options['method']}', language: {effective_options.get('language', 'unknown')}")
         return chunks_with_metadata_list
     except Exception as e:
         logger.error(f"Error creating chunk metadata: {e}", exc_info=True)
@@ -1482,4 +1501,4 @@ def load_document(file_path: str) -> str:
         raise
     except Exception as e:
         logger.error(f"Error loading document {file_path}: {e}")
-        raise
+        raise