Commit 8cefe6d

Merge pull request #72 from rmusser01/dev
Dev
2 parents: f974700 + 37c251e

File tree

17 files changed: +1014 −766 lines

tldw_chatbook/Chat/Chat_Functions.py

Lines changed: 0 additions & 1 deletion

```diff
@@ -74,7 +74,6 @@ def approximate_token_count(history):
         logger.error(f"Error calculating token count: {str(e)}")
         return 0
 
-# FIXME - Validate below
 # 1. Dispatch table for handler functions
 API_CALL_HANDLERS = {
     'openai': chat_with_openai,
```

tldw_chatbook/Chunking/Chunk_Lib.py

Lines changed: 65 additions & 42 deletions

```diff
@@ -5,7 +5,6 @@
 # Currently, uses naive approaches. Nothing fancy.
 #
 ####
-# Import necessary libraries
 import hashlib
 import json
 import re
@@ -14,22 +13,14 @@
 #
 # Import 3rd party
 from loguru import logger
-from tqdm import tqdm
 from langdetect import detect, LangDetectException  # Import specific exception
-from transformers import AutoTokenizer, PreTrainedTokenizerBase  # Using AutoTokenizer for flexibility
 import nltk
 from nltk.tokenize import sent_tokenize
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.metrics.pairwise import cosine_similarity
+
 #
 # Import Local
 from tldw_chatbook.config import load_settings, get_cli_setting
-from tldw_chatbook.config import global_default_chunk_language
 #
-# FIXME
-def load_and_log_configs():
-    pass
-#######################################################################################################################
 #######################################################################################################################
 # Custom Exceptions
 class ChunkingError(Exception):
@@ -160,16 +151,28 @@ def __init__(self,
 
         logger.debug(f"Chunker initialized with options: {self.options}")
 
-        try:
-            # Use the tokenizer specified in options if available, otherwise use the argument
-            tokenizer_to_load = self.options.get('tokenizer_name_or_path', tokenizer_name_or_path)
-            self.tokenizer: PreTrainedTokenizerBase = AutoTokenizer.from_pretrained(tokenizer_to_load)
-            logger.info(f"Tokenizer '{tokenizer_to_load}' loaded successfully.")
-        except Exception as e:
-            logger.error(f"Failed to load tokenizer '{self.options.get('tokenizer_name_or_path', tokenizer_name_or_path)}': {e}. Some token-based methods may fail.")
-            # Fallback or raise error? For now, set to None and let methods handle it.
-            self.tokenizer = None
-            # raise ChunkingError(f"Failed to load tokenizer: {e}") from e
+        from transformers import PreTrainedTokenizerBase
+        self._tokenizer: Optional[PreTrainedTokenizerBase] = None
+        self._tokenizer_path_to_load: str = self.options.get('tokenizer_name_or_path', tokenizer_name_or_path)
+
+    from transformers import PreTrainedTokenizerBase
+    @property
+    def tokenizer(self) -> PreTrainedTokenizerBase:
+        if self._tokenizer is None:
+            try:
+                from transformers import AutoTokenizer, PreTrainedTokenizerBase  # Import here
+                logger.info(f"Lazily loading tokenizer: {self._tokenizer_path_to_load}")
+                self._tokenizer = AutoTokenizer.from_pretrained(self._tokenizer_path_to_load)
+            except ImportError:
+                logger.error("Transformers library not found. Please install it to use token-based chunking.")
+                raise ChunkingError("Transformers library not found.")
+            except Exception as e:
+                logger.error(f"Failed to lazy-load tokenizer '{self._tokenizer_path_to_load}': {e}")
+                # Optionally, raise a more specific error or allow fallback if applicable
+                raise ChunkingError(f"Failed to load tokenizer: {e}") from e
+        if self._tokenizer is None:  # Should not happen if logic above is correct, but as a safeguard
+            raise ChunkingError("Tokenizer could not be loaded.")
+        return self._tokenizer
 
     def _get_option(self, key: str, default_override: Optional[Any] = None) -> Any:
         """Helper to get an option, allowing for a dynamic default."""
```
```diff
@@ -259,10 +262,15 @@ def chunk_text(self,
             base_adaptive_size = self._get_option('base_adaptive_chunk_size')
             min_adaptive_size = self._get_option('min_adaptive_chunk_size')
             max_adaptive_size = self._get_option('max_adaptive_chunk_size')
-            if self.tokenizer:  # NLTK based adaptive_chunk_size needs punkt
-                max_size = self._adaptive_chunk_size_nltk(text, base_adaptive_size, min_adaptive_size, max_adaptive_size, language)
-            else:  # Fallback if no tokenizer for NLTK based one.
-                max_size = self._adaptive_chunk_size_non_punkt(text, base_adaptive_size, min_adaptive_size, max_adaptive_size)
+            # Accessing self.tokenizer property here will trigger lazy loading if not already loaded.
+            try:
+                if self.tokenizer:  # NLTK based adaptive_chunk_size needs punkt
+                    max_size = self._adaptive_chunk_size_nltk(text, base_adaptive_size, min_adaptive_size, max_adaptive_size, language)
+                else:  # Fallback if no tokenizer for NLTK based one. (tokenizer property would have raised if failed to load)
+                    max_size = self._adaptive_chunk_size_non_punkt(text, base_adaptive_size, min_adaptive_size, max_adaptive_size)
+            except ChunkingError:  # Raised by tokenizer property if transformers not found or load fails
+                logger.warning("Tokenizer could not be loaded for adaptive chunk sizing. Using non-NLTK adaptive sizing.")
+                max_size = self._adaptive_chunk_size_non_punkt(text, base_adaptive_size, min_adaptive_size, max_adaptive_size)
             logger.info(f"Adaptive chunking adjusted max_size to: {max_size}")
 
 
```
```diff
@@ -279,8 +287,7 @@ def chunk_text(self,
         elif chunk_method == 'paragraphs':
             return self._chunk_text_by_paragraphs(text, max_paragraphs=max_size, overlap=overlap)
         elif chunk_method == 'tokens':
-            if not self.tokenizer:
-                raise ChunkingError("Tokenizer not loaded, cannot use 'tokens' chunking method.")
+            # self.tokenizer will raise ChunkingError if it cannot be loaded by its property.
             return self._chunk_text_by_tokens(text, max_tokens=max_size, overlap=overlap)
         elif chunk_method == 'semantic':
             # semantic_chunking needs to be a method of the class too
@@ -301,8 +308,7 @@ def chunk_text(self,
         elif chunk_method == 'rolling_summarize':
             if not llm_call_function:
                 raise ChunkingError("Missing 'llm_call_function' for 'rolling_summarize' method.")
-            if not self.tokenizer:  # Still need tokenizer for token counting in helper
-                raise ChunkingError("Tokenizer required for 'rolling_summarize' to estimate chunk sizes for LLM.")
+            # self.tokenizer will raise ChunkingError if it cannot be loaded by its property.
 
             summary = self._rolling_summarize(
                 text_to_summarize=text,
```
```diff
@@ -486,10 +492,8 @@ def _chunk_text_by_paragraphs(self, text: str, max_paragraphs: int, overlap: int
 
     def _chunk_text_by_tokens(self, text: str, max_tokens: int, overlap: int) -> List[str]:
         # This uses the accurate tokenizer version
-        if not self.tokenizer:
-            logger.error("Tokenizer not available for token-based chunking.")
-            raise ChunkingError("Tokenizer not loaded, cannot use 'tokens' chunking method.")
-
+        # Accessing self.tokenizer property here will trigger lazy loading.
+        # If it fails, ChunkingError will be raised by the property.
         logger.info(f"Chunking by tokens: max_tokens={max_tokens}, overlap_tokens={overlap} (token overlap)")
         if max_tokens <= 0:
             logger.warning("max_tokens must be positive. Returning single chunk or empty.")
@@ -642,11 +646,16 @@ def _semantic_chunking(self, text: str, max_chunk_size: int, unit: str) -> List[
         def _count_units(txt: str, unit_type: str) -> int:
             if unit_type == 'words':
                 return len(txt.split())
-            elif unit_type == 'tokens' and self.tokenizer:
+            elif unit_type == 'tokens':  # self.tokenizer property will be used here
                 return len(self.tokenizer.encode(txt))
             elif unit_type == 'characters':
                 return len(txt)
-            logger.warning(f"Unknown unit type '{unit_type}' or tokenizer missing for tokens. Defaulting to word count.")
+            # Tokenizer might not be available if transformers is not installed.
+            # The self.tokenizer property would raise ChunkingError if called when not available.
+            # So, if unit_type is 'tokens' and we reach here, it should be available.
+            # However, to be safe, let's consider the case it might still be None if an error occurred
+            # but wasn't propagated in a way that prevented this call.
+            logger.warning(f"Unknown unit type '{unit_type}' or tokenizer issues for tokens. Defaulting to word count.")
             return len(txt.split())
 
 
@@ -927,9 +936,18 @@ def _chunk_ebook_by_chapters(self, text: str, max_size: int, overlap: int, custo
         for i, chap_data in enumerate(chapter_splits):
             chap_data['metadata']['chunk_index_in_book'] = i + 1
             chap_data['metadata']['total_chapters_detected'] = len(chapter_splits)
-            tokenizer_available = hasattr(self, 'tokenizer') and self.tokenizer and hasattr(self.tokenizer,
-                                                                                            'encode') and callable(
-                self.tokenizer.encode)
+            # Access self.tokenizer property, will lazy load or raise.
+            tokenizer_available = False
+            try:
+                # Check if tokenizer can be accessed and used
+                _ = self.tokenizer.encode("test")  # A simple check that it works
+                tokenizer_available = True
+            except ChunkingError:  # From tokenizer property
+                logger.warning("Tokenizer not available for sub-chunking ebook chapters by tokens.")
+            except Exception as e_tok_check:  # Other unexpected errors
+                logger.warning(f"Unexpected error checking tokenizer for ebook sub-chunking: {e_tok_check}")
+
+
             if max_size > 0 and tokenizer_available and len(
                     # FIXME
                     self.tokenizer.encode(chap_data['text'])) > max_size:
```
```diff
@@ -1045,9 +1063,7 @@ def _rolling_summarize(self,
                            system_prompt_content: str,
                            additional_instructions: Optional[str]
                            ) -> str:
-        if not self.tokenizer:  # Should have been checked by caller (chunk_text)
-            raise ChunkingError("Tokenizer required for rolling summarization.")
-
+        # self.tokenizer property will be accessed here.
         logger.info(f"Rolling summarization called. Detail: {detail}")
         text_token_length = len(self.tokenizer.encode(text_to_summarize))
         max_summarization_chunks = max(1, text_token_length // min_chunk_tokens)
@@ -1070,6 +1086,14 @@ def _rolling_summarize(self,
         if additional_instructions:
             final_system_prompt += f"\n\n{additional_instructions}"
 
+        try:
+            from tqdm import tqdm  # Import here
+        except ImportError:
+            logger.warning("tqdm library not found. Progress bar for summarization parts will be disabled. Install with 'pip install tqdm'.")
+            # Define a dummy tqdm if not found, so the loop doesn't break
+            def tqdm(iterable, *args, **kwargs):
+                return iterable
+
         accumulated_summaries = []
         for i, chunk_for_llm in enumerate(tqdm(text_chunks_for_llm, desc="Summarizing parts", disable=not verbose)):
             user_message_content = chunk_for_llm
```
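
The tqdm hunk is the optional-dependency-with-stub pattern: attempt the import at the point of use, and on failure bind a no-op replacement with the same call shape so the loop runs unchanged. A generic sketch (the `progress` alias is illustrative, not from the project):

```python
import logging

logger = logging.getLogger(__name__)

try:
    from tqdm import tqdm as progress  # real progress bar when available
except ImportError:
    logger.warning("tqdm not installed; progress bars disabled.")

    def progress(iterable, *args, **kwargs):
        # No-op stand-in: accepts tqdm-style arguments and yields items unchanged.
        return iterable

for item in progress(range(3), desc="work"):
    pass  # the loop body is identical with or without tqdm
```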
```diff
@@ -1117,8 +1141,7 @@ def _combine_chunks_for_llm(self,
                                 header: Optional[str] = None,
                                 add_ellipsis_for_overflow: bool = True,
                                 ) -> Tuple[List[str], List[List[int]], int]:
-        if not self.tokenizer:
-            raise ChunkingError("Tokenizer required for _combine_chunks_for_llm.")
+        # self.tokenizer property will be accessed here.
 
         dropped_chunk_count = 0
         output_combined_texts = []
```
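
Taken together, the Chunk_Lib.py changes mean importing the module no longer drags in transformers, tqdm, or scikit-learn; only token-based paths touch the tokenizer, and they fail with `ChunkingError` when the dependency is absent. A hedged usage sketch (the `Chunker` constructor shape and the `method` keyword are assumptions inferred from this diff, not confirmed API):

```python
from tldw_chatbook.Chunking.Chunk_Lib import Chunker, ChunkingError

text = "Some long document text. " * 200

# Importing the module is now cheap even without transformers installed.
chunker = Chunker(options={'tokenizer_name_or_path': 'gpt2'})  # hypothetical call shape

try:
    # The first token-based call triggers the lazy tokenizer load.
    chunks = chunker.chunk_text(text, method='tokens')  # hypothetical parameter name
except ChunkingError as exc:
    # Raised by the tokenizer property when transformers is missing or loading fails.
    print(f"Token-based chunking unavailable: {exc}")
```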

tldw_chatbook/Coding/code_mapper.py

Lines changed: 3 additions & 1 deletion

```diff
@@ -8,7 +8,9 @@
 from collections import defaultdict
 #
 # Third-Party Imports
-from aider.repo_map import RepoMap, find_src_files  # Import necessary components
+#
+# Local Imports
+from tldw_chatbook.Third_Party.aider.repomap import RepoMap
 #
 ########################################################################################################################
 #
```

tldw_chatbook/Constants.py

Lines changed: 43 additions & 25 deletions

```diff
@@ -1974,9 +1974,28 @@
 MLX_LM_SERVER_ARGS_HELP_TEXT = """
 [bold cyan]--- MLX-LM Server Arguments ---[/]
 
+options:
+  --adapter-path ADAPTER_PATH
+                        Optional path for the trained adapter weights and
+                        config.
+
+
+
+  --temp TEMP           Default sampling temperature (default: 0.0)
+  --top-p TOP_P         Default nucleus sampling top-p (default: 1.0)
+  --top-k TOP_K         Default top-k sampling (default: 0, disables top-k)
+  --min-p MIN_P         Default min-p sampling (default: 0.0, disables min-p)
+  --max-tokens MAX_TOKENS
+                        Default maximum number of tokens to generate (default:
+                        512)
+  --chat-template-args CHAT_TEMPLATE_ARGS
+                        A JSON formatted string of arguments for the
+                        tokenizer's apply_chat_template, e.g.
+                        '{"enable_thinking":false}'
+
 [bold]--model MODEL[/]
-    Path to the model directory or HuggingFace model ID
-    (e.g., [italic]--model mlx-community/Nous-Hermes-2-Mistral-7B-DPO-4bit-MLX[/])
+    The path to the MLX model weights, tokenizer, and config
+    (e.g., [italic]--model mlx-community/Qwen3-30B-A3B-4bit[/])
 
 [bold]--host HOST[/]
     Host address to bind the server to (default: 127.0.0.1)
@@ -1986,9 +2005,21 @@
     Port to run the server on (default: 8080)
     (e.g., [italic]--port 8000[/])
 
-[bold]--max-tokens N[/]
-    Maximum number of tokens to generate (default: 100)
-    (e.g., [italic]--max-tokens 512[/])
+[bold]--draft-model DRAFT_MODEL[/]
+    A model to be used for speculative decoding.
+    (e.g., [italic]--draft-model mlx-community/Qwen3-0.6B-8bit[/])
+
+[bold]--num-draft-tokens NUM_DRAFT_TOKENS[/]
+    Number of tokens to draft when using speculative decoding.
+
+[bold]--trust-remote-code[/]
+    Enable trusting remote code for tokenizer
+
+[bold]--chat-template CHAT_TEMPLATE[/]
+    Specify a chat template for the tokenizer
+
+[bold]--use-default-chat-template[/]
+    Use the default chat template
 
 [bold]--temperature TEMP[/]
     Sampling temperature (default: 0.8)
@@ -2002,28 +2033,15 @@
     Top-k sampling (default: 40)
     (e.g., [italic]--top-k 50[/])
 
-[bold]--seed SEED[/]
-    Random seed for reproducibility (default: None)
-    (e.g., [italic]--seed 42[/])
+[bold]--min-p MIN_P[/]
+    Default min-p sampling (default: 0.0, disables min-p)
 
-[bold]--batch-size N[/]
-    Batch size for inference (default: 1)
-    (e.g., [italic]--batch-size 4[/])
-
-[bold]--quantization {int8,int4,fp16,fp32}[/]
-    Quantization method to use (default: None)
-    (e.g., [italic]--quantization int4[/])
-
-[bold]--device {cpu,gpu}[/]
-    Device to run inference on (default: auto-detect)
-    (e.g., [italic]--device gpu[/])
-
-[bold]--trust-remote-code[/]
-    Trust remote code when loading models from HuggingFace
+[bold]--max-tokens N[/]
+    Maximum number of tokens to generate (default: 100)
+    (e.g., [italic]--max-tokens 512[/])
 
-[bold]--revision REVISION[/]
-    Specific model revision to use from HuggingFace
-    (e.g., [italic]--revision main[/])
+[bold]--chat-template-args CHAT_TEMPLATE_ARGS[/]
+    A JSON formatted string of arguments for the tokenizer's apply_chat_template, e.g. '{"enable_thinking":false}'
 """
 
 # End of Constants.py
```
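
The rewritten help text tracks the flags the current mlx-lm server actually exposes (speculative decoding via --draft-model/--num-draft-tokens, chat-template controls) and drops flags it does not (--seed, --batch-size, --quantization, --device, --revision). A hedged launch sketch using only flags documented above (assumes the mlx-lm package is installed; the `python -m mlx_lm.server` entry point and the draft-token value are assumptions to verify against your installed version):

```python
import subprocess

# Start the MLX-LM OpenAI-compatible server with flags from the help text above.
cmd = [
    "python", "-m", "mlx_lm.server",  # assumed entry point; check your mlx-lm version
    "--model", "mlx-community/Qwen3-30B-A3B-4bit",
    "--host", "127.0.0.1",
    "--port", "8080",
    "--draft-model", "mlx-community/Qwen3-0.6B-8bit",  # speculative decoding
    "--num-draft-tokens", "3",  # illustrative value
    "--chat-template-args", '{"enable_thinking":false}',
]
subprocess.run(cmd, check=True)
```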

tldw_chatbook/DB/Client_Media_DB_v2.py

Lines changed: 44 additions & 0 deletions

```diff
@@ -3951,6 +3951,50 @@ def search_media_by_keyword_for_embedding(self, keyword: str, limit: int = 1000)
         return self.get_media_by_ids_for_embedding(media_ids_found)
     # ============================= End of Embedding-related Functions ===================================================
 
+    # ============================= Chat UI Functions for Search ===================================================
+    def fetch_keywords_for_media_batch(self, media_ids: List[int]) -> Dict[int, List[str]]:
+        """
+        Fetches keywords associated with a batch of media IDs.
+
+        Args:
+            media_ids: List of media IDs to fetch keywords for.
+
+        Returns:
+            A dictionary mapping media IDs to lists of associated keywords.
+
+        Raises:
+            DatabaseError: If a database error occurs.
+        """
+        if not media_ids:
+            return {}
+
+        placeholders = ','.join('?' * len(media_ids))
+        query = f"""
+            SELECT mk.media_id, k.keyword
+            FROM MediaKeywords mk
+            JOIN Keywords k ON mk.keyword_id = k.id
+            WHERE mk.media_id IN ({placeholders}) AND k.deleted = 0
+        """
+        try:
+            conn = self.get_connection()
+            cursor = conn.execute(query, tuple(media_ids))
+            results = cursor.fetchall()
+
+            # Group keywords by media ID
+            keywords_by_media = {}
+            for media_id, keyword in results:
+                if media_id not in keywords_by_media:
+                    keywords_by_media[media_id] = []
+                keywords_by_media[media_id].append(keyword)
+
+            return keywords_by_media
+        except sqlite3.Error as e:
+            logger.error(f"Error fetching keywords for media batch: {e}", exc_info=True)
+            raise DatabaseError(f"Failed to fetch keywords for media batch: {e}") from e
+
+    # ============================= End of Chat UI Functions for Search ===================================================
+
+
 
 # =========================================================================
 # Standalone Functions (REQUIRE db_instance passed explicitly)
```
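
The new method builds a parameterized IN clause (one `?` per ID) so a single round trip replaces N per-item keyword lookups, then groups the rows into a dict keyed by media ID. A hedged usage sketch (`MediaDatabase` and its constructor are assumed names; only `fetch_keywords_for_media_batch` comes from this diff):

```python
from tldw_chatbook.DB.Client_Media_DB_v2 import MediaDatabase  # hypothetical class name

db = MediaDatabase("media.db")  # hypothetical constructor signature
media_ids = [101, 102, 103]

# Single query fetches keywords for the whole batch.
keywords_by_media = db.fetch_keywords_for_media_batch(media_ids)

for media_id in media_ids:
    # IDs with no (non-deleted) keywords are simply absent from the dict.
    print(media_id, keywords_by_media.get(media_id, []))
```

Because the query binds one parameter per ID, very large batches can run into SQLite's bound-variable limit (999 in older builds), so callers passing thousands of IDs may need to split the list.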
