# Currently, uses naive approaches. Nothing fancy.
#
####
- # Import necessary libraries
import hashlib
import json
import re
#
# Import 3rd party
from loguru import logger
- from tqdm import tqdm
from langdetect import detect, LangDetectException  # Import specific exception
- from transformers import AutoTokenizer, PreTrainedTokenizerBase  # Using AutoTokenizer for flexibility
import nltk
from nltk.tokenize import sent_tokenize
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.metrics.pairwise import cosine_similarity
+
#
# Import Local
from tldw_chatbook.config import load_settings, get_cli_setting
- from tldw_chatbook.config import global_default_chunk_language
#
- # FIXME
- def load_and_log_configs():
-     pass
- #######################################################################################################################
#######################################################################################################################
# Custom Exceptions
class ChunkingError(Exception):
@@ -160,16 +151,28 @@ def __init__(self,

        logger.debug(f"Chunker initialized with options: {self.options}")

-         try:
-             # Use the tokenizer specified in options if available, otherwise use the argument
-             tokenizer_to_load = self.options.get('tokenizer_name_or_path', tokenizer_name_or_path)
-             self.tokenizer: PreTrainedTokenizerBase = AutoTokenizer.from_pretrained(tokenizer_to_load)
-             logger.info(f"Tokenizer '{tokenizer_to_load}' loaded successfully.")
-         except Exception as e:
-             logger.error(f"Failed to load tokenizer '{self.options.get('tokenizer_name_or_path', tokenizer_name_or_path)}': {e}. Some token-based methods may fail.")
-             # Fallback or raise error? For now, set to None and let methods handle it.
-             self.tokenizer = None
-             # raise ChunkingError(f"Failed to load tokenizer: {e}") from e
+         from transformers import PreTrainedTokenizerBase
+         self._tokenizer: Optional[PreTrainedTokenizerBase] = None
+         self._tokenizer_path_to_load: str = self.options.get('tokenizer_name_or_path', tokenizer_name_or_path)
+
+     from transformers import PreTrainedTokenizerBase
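+     # Re-imported at class level for the property's return annotation below.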
+     @property
+     def tokenizer(self) -> PreTrainedTokenizerBase:
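+         """Lazily load and cache the tokenizer on first access, raising ChunkingError if it cannot be loaded."""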
+         if self._tokenizer is None:
+             try:
+                 from transformers import AutoTokenizer, PreTrainedTokenizerBase  # Import here
+                 logger.info(f"Lazily loading tokenizer: {self._tokenizer_path_to_load}")
+                 self._tokenizer = AutoTokenizer.from_pretrained(self._tokenizer_path_to_load)
+             except ImportError:
+                 logger.error("Transformers library not found. Please install it to use token-based chunking.")
+                 raise ChunkingError("Transformers library not found.")
+             except Exception as e:
+                 logger.error(f"Failed to lazy-load tokenizer '{self._tokenizer_path_to_load}': {e}")
+                 # Optionally, raise a more specific error or allow fallback if applicable
+                 raise ChunkingError(f"Failed to load tokenizer: {e}") from e
+         if self._tokenizer is None:  # Should not happen if logic above is correct, but as a safeguard
+             raise ChunkingError("Tokenizer could not be loaded.")
+         return self._tokenizer

    def _get_option(self, key: str, default_override: Optional[Any] = None) -> Any:
        """Helper to get an option, allowing for a dynamic default."""
@@ -259,10 +262,15 @@ def chunk_text(self,
            base_adaptive_size = self._get_option('base_adaptive_chunk_size')
            min_adaptive_size = self._get_option('min_adaptive_chunk_size')
            max_adaptive_size = self._get_option('max_adaptive_chunk_size')
-             if self.tokenizer:  # NLTK based adaptive_chunk_size needs punkt
-                 max_size = self._adaptive_chunk_size_nltk(text, base_adaptive_size, min_adaptive_size, max_adaptive_size, language)
-             else:  # Fallback if no tokenizer for NLTK based one.
-                 max_size = self._adaptive_chunk_size_non_punkt(text, base_adaptive_size, min_adaptive_size, max_adaptive_size)
+             # Accessing the self.tokenizer property here will trigger lazy loading if not already loaded.
+             try:
+                 if self.tokenizer:  # NLTK based adaptive_chunk_size needs punkt
+                     max_size = self._adaptive_chunk_size_nltk(text, base_adaptive_size, min_adaptive_size, max_adaptive_size, language)
+                 else:  # Fallback if no tokenizer for the NLTK based one. (The tokenizer property would have raised if it failed to load.)
+                     max_size = self._adaptive_chunk_size_non_punkt(text, base_adaptive_size, min_adaptive_size, max_adaptive_size)
+             except ChunkingError:  # Raised by the tokenizer property if transformers is not found or loading fails
+                 logger.warning("Tokenizer could not be loaded for adaptive chunk sizing. Using non-NLTK adaptive sizing.")
+                 max_size = self._adaptive_chunk_size_non_punkt(text, base_adaptive_size, min_adaptive_size, max_adaptive_size)
            logger.info(f"Adaptive chunking adjusted max_size to: {max_size}")

@@ -279,8 +287,7 @@ def chunk_text(self,
        elif chunk_method == 'paragraphs':
            return self._chunk_text_by_paragraphs(text, max_paragraphs=max_size, overlap=overlap)
        elif chunk_method == 'tokens':
-             if not self.tokenizer:
-                 raise ChunkingError("Tokenizer not loaded, cannot use 'tokens' chunking method.")
+             # self.tokenizer will raise ChunkingError if it cannot be loaded by its property.
            return self._chunk_text_by_tokens(text, max_tokens=max_size, overlap=overlap)
        elif chunk_method == 'semantic':
            # semantic_chunking needs to be a method of the class too
@@ -301,8 +308,7 @@ def chunk_text(self,
        elif chunk_method == 'rolling_summarize':
            if not llm_call_function:
                raise ChunkingError("Missing 'llm_call_function' for 'rolling_summarize' method.")
-             if not self.tokenizer:  # Still need tokenizer for token counting in helper
-                 raise ChunkingError("Tokenizer required for 'rolling_summarize' to estimate chunk sizes for LLM.")
+             # self.tokenizer will raise ChunkingError if it cannot be loaded by its property.

            summary = self._rolling_summarize(
                text_to_summarize=text,
@@ -486,10 +492,8 @@ def _chunk_text_by_paragraphs(self, text: str, max_paragraphs: int, overlap: int

    def _chunk_text_by_tokens(self, text: str, max_tokens: int, overlap: int) -> List[str]:
        # This uses the accurate tokenizer version
-         if not self.tokenizer:
-             logger.error("Tokenizer not available for token-based chunking.")
-             raise ChunkingError("Tokenizer not loaded, cannot use 'tokens' chunking method.")
-
+         # Accessing the self.tokenizer property here will trigger lazy loading.
+         # If it fails, ChunkingError will be raised by the property.
        logger.info(f"Chunking by tokens: max_tokens={max_tokens}, overlap_tokens={overlap} (token overlap)")
        if max_tokens <= 0:
            logger.warning("max_tokens must be positive. Returning single chunk or empty.")
@@ -642,11 +646,16 @@ def _semantic_chunking(self, text: str, max_chunk_size: int, unit: str) -> List[
        def _count_units(txt: str, unit_type: str) -> int:
            if unit_type == 'words':
                return len(txt.split())
-             elif unit_type == 'tokens' and self.tokenizer:
+             elif unit_type == 'tokens':  # the self.tokenizer property handles lazy loading
                return len(self.tokenizer.encode(txt))
            elif unit_type == 'characters':
                return len(txt)
-             logger.warning(f"Unknown unit type '{unit_type}' or tokenizer missing for tokens. Defaulting to word count.")
+             # The 'tokens' branch above either returns or raises ChunkingError via the
+             # self.tokenizer property, but keep a defensive fallback here in case an
+             # unknown unit type (or an unexpected tokenizer issue) slips through.
+             logger.warning(f"Unknown unit type '{unit_type}' or tokenizer issues for tokens. Defaulting to word count.")
            return len(txt.split())

@@ -927,9 +936,18 @@ def _chunk_ebook_by_chapters(self, text: str, max_size: int, overlap: int, custo
        for i, chap_data in enumerate(chapter_splits):
            chap_data['metadata']['chunk_index_in_book'] = i + 1
            chap_data['metadata']['total_chapters_detected'] = len(chapter_splits)
-             tokenizer_available = hasattr(self, 'tokenizer') and self.tokenizer and hasattr(self.tokenizer,
-                 'encode') and callable(
-                 self.tokenizer.encode)
+             # Access the self.tokenizer property, which will lazy-load or raise.
+             tokenizer_available = False
+             try:
+                 # Check if the tokenizer can be accessed and used
+                 _ = self.tokenizer.encode("test")  # A simple check that it works
+                 tokenizer_available = True
+             except ChunkingError:  # From the tokenizer property
+                 logger.warning("Tokenizer not available for sub-chunking ebook chapters by tokens.")
+             except Exception as e_tok_check:  # Other unexpected errors
+                 logger.warning(f"Unexpected error checking tokenizer for ebook sub-chunking: {e_tok_check}")
+
+
            if max_size > 0 and tokenizer_available and len(
                    # FIXME
                    self.tokenizer.encode(chap_data['text'])) > max_size:
@@ -1045,9 +1063,7 @@ def _rolling_summarize(self,
                           system_prompt_content: str,
                           additional_instructions: Optional[str]
                           ) -> str:
-         if not self.tokenizer:  # Should have been checked by caller (chunk_text)
-             raise ChunkingError("Tokenizer required for rolling summarization.")
-
+         # The self.tokenizer property will be accessed here.
        logger.info(f"Rolling summarization called. Detail: {detail}")
        text_token_length = len(self.tokenizer.encode(text_to_summarize))
        max_summarization_chunks = max(1, text_token_length // min_chunk_tokens)
@@ -1070,6 +1086,14 @@ def _rolling_summarize(self,
        if additional_instructions:
            final_system_prompt += f"\n\n{additional_instructions}"

+         try:
+             from tqdm import tqdm  # Import here
+         except ImportError:
+             logger.warning("tqdm library not found. Progress bar for summarization parts will be disabled. Install with 'pip install tqdm'.")
+             # Define a dummy tqdm if not found, so the loop doesn't break
+             def tqdm(iterable, *args, **kwargs):
+                 return iterable
+
        accumulated_summaries = []
        for i, chunk_for_llm in enumerate(tqdm(text_chunks_for_llm, desc="Summarizing parts", disable=not verbose)):
            user_message_content = chunk_for_llm
@@ -1117,8 +1141,7 @@ def _combine_chunks_for_llm(self,
                                headers: Optional[str] = None,
                                add_ellipsis_for_overflow: bool = True,
                                ) -> Tuple[List[str], List[List[int]], int]:
-         if not self.tokenizer:
-             raise ChunkingError("Tokenizer required for _combine_chunks_for_llm.")
+         # The self.tokenizer property will be accessed here.

        dropped_chunk_count = 0
        output_combined_texts = []