Skip to content

Commit a74dbef

Browse files
authored
Sync
Sync
2 parents 275eb3b + f2815a0 commit a74dbef

39 files changed: +6982 -1331 lines changed

pyproject.toml

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -45,6 +45,12 @@ dependencies = [
4545

4646
# --- Optional Dependencies ---
4747
[project.optional-dependencies]
48+
coding_map = [
49+
"grep_ast",
50+
"pygments",
51+
"tqdm",
52+
53+
]
4854
chunker = [
4955
"langdetect",
5056
"nltk",
@@ -54,6 +60,12 @@ chunker = [
5460
"tqdm",
5561
"jieba",
5662
]
63+
embeddings_rag = [
64+
"torch",
65+
"numpy",
66+
"pydantic",
67+
"transformers",
68+
]
5769
websearch = [
5870
"lxml",
5971
"bs4",#beautifulsoup

tldw_chatbook/Character_Chat/Character_Chat_Lib.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,9 @@
1818
from PIL import Image # For image processing
1919
from loguru import logger
2020
# from PIL.Image import Image as PILImage # More specific for type hints if needed
21+
22+
# Configure logger with context
23+
logger = logger.bind(module="Character_Chat_Lib")
2124
#
2225
# Local Imports
2326
from tldw_chatbook.DB.ChaChaNotes_DB import CharactersRAGDB, CharactersRAGDBError, ConflictError, InputError
@@ -2583,4 +2586,4 @@ def find_messages_in_conversation(
25832586
# End of File
25842587
########################################################################################################################
25852588
def load_character_card_from_file(param):
2586-
return None
2589+
return None

tldw_chatbook/Chat/Chat_Functions.py

Lines changed: 17 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,9 @@
3232
import requests
3333
from pydantic import BaseModel, Field
3434

35+
# Configure logger with context
36+
logger = logger.bind(module="Chat_Functions")
37+
3538
#
3639
# Local Imports
3740
from .Chat_Deps import ChatBadRequestError, ChatConfigurationError, ChatAPIError, \
@@ -68,7 +71,7 @@ def approximate_token_count(history):
6871
total_tokens = len(total_text.split())
6972
return total_tokens
7073
except Exception as e:
71-
logging.error(f"Error calculating token count: {str(e)}")
74+
logger.error(f"Error calculating token count: {str(e)}")
7275
return 0
7376

7477
# FIXME - Validate below
@@ -570,13 +573,13 @@ def chat_api_call(
570573
requests.exceptions.RequestException: For network errors during the request.
571574
"""
572575
endpoint_lower = api_endpoint.lower()
573-
logging.info(f"Chat API Call - Routing to endpoint: {endpoint_lower}")
576+
logger.info(f"Chat API Call - Routing to endpoint: {endpoint_lower}")
574577
log_counter("chat_api_call_attempt", labels={"api_endpoint": endpoint_lower})
575578
start_time = time.time()
576579

577580
handler = API_CALL_HANDLERS.get(endpoint_lower)
578581
if not handler:
579-
logging.error(f"Unsupported API endpoint requested: {api_endpoint}")
582+
logger.error(f"Unsupported API endpoint requested: {api_endpoint}")
580583
raise ValueError(f"Unsupported API endpoint: {api_endpoint}")
581584

582585
params_map = PROVIDER_PARAM_MAP.get(endpoint_lower, {})
@@ -620,22 +623,22 @@ def chat_api_call(
620623
pass # Specific handling for Cohere's prompt is assumed to be within chat_with_cohere
621624

622625
if call_kwargs.get(params_map.get('api_key', 'api_key')) and isinstance(call_kwargs.get(params_map.get('api_key', 'api_key')), str) and len(call_kwargs.get(params_map.get('api_key', 'api_key'))) > 8:
623-
logging.info(f"Debug - Chat API Call - API Key: {call_kwargs[params_map.get('api_key', 'api_key')][:4]}...{call_kwargs[params_map.get('api_key', 'api_key')][-4:]}")
626+
logger.info(f"Debug - Chat API Call - API Key: {call_kwargs[params_map.get('api_key', 'api_key')][:4]}...{call_kwargs[params_map.get('api_key', 'api_key')][-4:]}")
624627

625628
try:
626-
logging.debug(f"Calling handler {handler.__name__} with kwargs: { {k: (type(v) if k != params_map.get('api_key') else 'key_hidden') for k,v in call_kwargs.items()} }")
629+
logger.debug(f"Calling handler {handler.__name__} with kwargs: { {k: (type(v) if k != params_map.get('api_key') else 'key_hidden') for k,v in call_kwargs.items()} }")
627630
response = handler(**call_kwargs)
628631

629632
call_duration = time.time() - start_time
630633
log_histogram("chat_api_call_duration", call_duration, labels={"api_endpoint": endpoint_lower})
631634
log_counter("chat_api_call_success", labels={"api_endpoint": endpoint_lower})
632635

633636
if isinstance(response, str):
634-
logging.debug(f"Debug - Chat API Call - Response (first 500 chars): {response[:500]}...")
637+
logger.debug(f"Debug - Chat API Call - Response (first 500 chars): {response[:500]}...")
635638
elif hasattr(response, '__iter__') and not isinstance(response, (str, bytes, dict)):
636-
logging.debug(f"Debug - Chat API Call - Response: Streaming Generator")
639+
logger.debug(f"Debug - Chat API Call - Response: Streaming Generator")
637640
else:
638-
logging.debug(f"Debug - Chat API Call - Response Type: {type(response)}")
641+
logger.debug(f"Debug - Chat API Call - Response Type: {type(response)}")
639642
return response
640643

641644
# --- Exception Mapping (copied from your original, ensure it's still relevant) ---
@@ -646,9 +649,9 @@ def chat_api_call(
646649

647650
# Log safely first
648651
try:
649-
logging.error("%s. Details: %s", log_message_base, error_text[:500], exc_info=False)
652+
logger.error("%s. Details: %s", log_message_base, error_text[:500], exc_info=False)
650653
except Exception as log_e:
651-
logging.error(f"Error during logging HTTPError details: {log_e}")
654+
logger.error(f"Error during logging HTTPError details: {log_e}")
652655

653656
detail_message = f"API call to {endpoint_lower} failed with status {status_code}. Response: {error_text[:200]}"
654657
if status_code == 401:
@@ -669,26 +672,26 @@ def chat_api_call(
669672
message=f"Unexpected HTTP status {status_code} from {endpoint_lower}. Detail: {error_text[:200]}",
670673
status_code=status_code)
671674
except requests.exceptions.RequestException as e:
672-
logging.error(f"Network error connecting to {endpoint_lower}: {e}", exc_info=False)
675+
logger.error(f"Network error connecting to {endpoint_lower}: {e}", exc_info=False)
673676
raise ChatProviderError(provider=endpoint_lower, message=f"Network error: {e}", status_code=504)
674677
except (ChatAuthenticationError, ChatRateLimitError, ChatBadRequestError, ChatConfigurationError, ChatProviderError,
675678
ChatAPIError) as e_chat_direct:
676679
# This catches cases where the handler itself has already processed an error
677680
# (e.g. non-HTTP error, or it decided to raise a specific Chat*Error type)
678681
# and raises one of our custom exceptions.
679-
logging.error(
682+
logger.error(
680683
f"Handler for {endpoint_lower} directly raised: {type(e_chat_direct).__name__} - {e_chat_direct.message}",
681684
exc_info=True if e_chat_direct.status_code >= 500 else False)
682685
raise e_chat_direct # Re-raise the specific error
683686
except (ValueError, TypeError, KeyError) as e:
684-
logging.error(f"Value/Type/Key error during chat API call setup for {endpoint_lower}: {e}", exc_info=True)
687+
logger.error(f"Value/Type/Key error during chat API call setup for {endpoint_lower}: {e}", exc_info=True)
685688
error_type = "Configuration/Parameter Error"
686689
if "Unsupported API endpoint" in str(e):
687690
raise ChatConfigurationError(provider=endpoint_lower, message=f"Unsupported API endpoint: {endpoint_lower}")
688691
else:
689692
raise ChatBadRequestError(provider=endpoint_lower, message=f"{error_type} for {endpoint_lower}: {e}")
690693
except Exception as e:
691-
logging.exception(
694+
logger.exception(
692695
f"Unexpected internal error in chat_api_call for {endpoint_lower}: {e}")
693696
raise ChatAPIError(provider=endpoint_lower,
694697
message=f"An unexpected internal error occurred in chat_api_call for {endpoint_lower}: {str(e)}",

tldw_chatbook/Chunking/Chunk_Lib.py

Lines changed: 29 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -325,13 +325,14 @@ def chunk_text(self,
325325

326326

327327
def _chunk_text_by_words(self, text: str, max_words: int, overlap: int, language: str) -> List[str]:
328-
logger.debug(f"Chunking by words: max_words={max_words}, overlap={overlap}, language='{language}'")
328+
logger.info(f"Chunking by words: max_words={max_words}, overlap={overlap}, language='{language}'")
329329
# Language-specific word tokenization
330330
words: List[str]
331331
if language.startswith('zh'): # Chinese
332332
try:
333333
import jieba
334334
words = list(jieba.cut(text))
335+
logger.debug(f"Using jieba for Chinese word tokenization, found {len(words)} words")
335336
except ImportError:
336337
logger.warning("jieba library not found for Chinese word tokenization. Falling back to space splitting.")
337338
words = text.split()
@@ -371,11 +372,13 @@ def _chunk_text_by_words(self, text: str, max_words: int, overlap: int, language
371372
chunks.append(' '.join(chunk_words))
372373
logger.debug(f"Created word chunk {len(chunks)} with {len(chunk_words)} words")
373374

374-
return self._post_process_chunks(chunks)
375+
processed_chunks = self._post_process_chunks(chunks)
376+
logger.info(f"Word chunking complete: created {len(processed_chunks)} chunks from {len(words)} words")
377+
return processed_chunks
375378

376379

377380
def _chunk_text_by_sentences(self, text: str, max_sentences: int, overlap: int, language: str) -> List[str]:
378-
logger.debug(f"Chunking by sentences: max_sentences={max_sentences}, overlap={overlap}, lang='{language}'")
381+
logger.info(f"Chunking by sentences: max_sentences={max_sentences}, overlap={overlap}, lang='{language}'")
379382
sentences: List[str]
380383

381384
if language.startswith('zh'):
@@ -445,11 +448,15 @@ def _chunk_text_by_sentences(self, text: str, max_sentences: int, overlap: int,
445448
for i in range(0, len(sentences), step):
446449
chunk_sentences = sentences[i : i + max_sentences]
447450
chunks.append(' '.join(chunk_sentences))
448-
return self._post_process_chunks(chunks)
451+
logger.debug(f"Created sentence chunk {len(chunks)} with {len(chunk_sentences)} sentences")
452+
453+
processed_chunks = self._post_process_chunks(chunks)
454+
logger.info(f"Sentence chunking complete: created {len(processed_chunks)} chunks from {len(sentences)} sentences")
455+
return processed_chunks
449456

450457

451458
def _chunk_text_by_paragraphs(self, text: str, max_paragraphs: int, overlap: int) -> List[str]:
452-
logger.debug(f"Chunking by paragraphs: max_paragraphs={max_paragraphs}, overlap={overlap}")
459+
logger.info(f"Chunking by paragraphs: max_paragraphs={max_paragraphs}, overlap={overlap}")
453460
# Split by one or more empty lines (common paragraph delimiter)
454461
paragraphs = re.split(r'\n\s*\n+', text)
455462
paragraphs = [p.strip() for p in paragraphs if p.strip()] # Remove empty paragraphs
@@ -470,7 +477,11 @@ def _chunk_text_by_paragraphs(self, text: str, max_paragraphs: int, overlap: int
470477
for i in range(0, len(paragraphs), step):
471478
chunk_paragraphs = paragraphs[i : i + max_paragraphs]
472479
chunks.append('\n\n'.join(chunk_paragraphs)) # Join with double newline to preserve paragraph structure
473-
return self._post_process_chunks(chunks) # post_process_chunks strips leading/trailing, which is fine
480+
logger.debug(f"Created paragraph chunk {len(chunks)} with {len(chunk_paragraphs)} paragraphs")
481+
482+
processed_chunks = self._post_process_chunks(chunks) # post_process_chunks strips leading/trailing, which is fine
483+
logger.info(f"Paragraph chunking complete: created {len(processed_chunks)} chunks from {len(paragraphs)} paragraphs")
484+
return processed_chunks
474485

475486

476487
def _chunk_text_by_tokens(self, text: str, max_tokens: int, overlap: int) -> List[str]:
@@ -479,7 +490,7 @@ def _chunk_text_by_tokens(self, text: str, max_tokens: int, overlap: int) -> Lis
479490
logger.error("Tokenizer not available for token-based chunking.")
480491
raise ChunkingError("Tokenizer not loaded, cannot use 'tokens' chunking method.")
481492

482-
logger.debug(f"Chunking by tokens: max_tokens={max_tokens}, overlap_tokens={overlap} (token overlap)")
493+
logger.info(f"Chunking by tokens: max_tokens={max_tokens}, overlap_tokens={overlap} (token overlap)")
483494
if max_tokens <= 0:
484495
logger.warning("max_tokens must be positive. Returning single chunk or empty.")
485496
return [text] if text.strip() else []
@@ -501,7 +512,11 @@ def _chunk_text_by_tokens(self, text: str, max_tokens: int, overlap: int) -> Lis
501512
chunk_token_ids = tokens[i : i + max_tokens]
502513
chunk_text = self.tokenizer.decode(chunk_token_ids, skip_special_tokens=True) # skip_special_tokens might be an option
503514
chunks.append(chunk_text)
504-
return self._post_process_chunks(chunks)
515+
logger.debug(f"Created token chunk {len(chunks)} with {len(chunk_token_ids)} tokens")
516+
517+
processed_chunks = self._post_process_chunks(chunks)
518+
logger.info(f"Token chunking complete: created {len(processed_chunks)} chunks from {len(tokens)} tokens")
519+
return processed_chunks
505520

506521

507522
# --- Adaptive Chunking Methods ---
@@ -1357,8 +1372,9 @@ def improved_chunking_process(text: str,
13571372
llm_call_function_for_chunker: Optional[Callable] = None,
13581373
llm_api_config_for_chunker: Optional[Dict[str, Any]] = None
13591374
) -> List[Dict[str, Any]]:
1360-
logger.debug("Improved chunking process started...")
1375+
logger.info("Improved chunking process started...")
13611376
logger.debug(f"Received chunk_options_dict: {chunk_options_dict}")
1377+
logger.debug(f"Text length: {len(text)} characters, tokenizer: {tokenizer_name_or_path}")
13621378

13631379
chunker_instance = Chunker(options=chunk_options_dict,
13641380
tokenizer_name_or_path=tokenizer_name_or_path,
@@ -1421,8 +1437,10 @@ def improved_chunking_process(text: str,
14211437

14221438
chunks_with_metadata_list = []
14231439
total_chunks_count = len(raw_chunks)
1440+
logger.info(f"Processing {total_chunks_count} chunks for metadata enrichment")
14241441
try:
14251442
for i, chunk_item in enumerate(raw_chunks):
1443+
logger.debug(f"Processing chunk {i+1}/{total_chunks_count}")
14261444
actual_text_content: str
14271445
is_json_chunk = False
14281446
chunk_specific_metadata = {} # Initialize
@@ -1464,6 +1482,7 @@ def improved_chunking_process(text: str,
14641482
})
14651483

14661484
logger.debug(f"Successfully created metadata for all {len(chunks_with_metadata_list)} chunks")
1485+
logger.info(f"Improved chunking process completed: {len(chunks_with_metadata_list)} chunks created using method '{effective_options['method']}', language: {effective_options.get('language', 'unknown')}")
14671486
return chunks_with_metadata_list
14681487
except Exception as e:
14691488
logger.error(f"Error creating chunk metadata: {e}", exc_info=True)
@@ -1482,4 +1501,4 @@ def load_document(file_path: str) -> str:
14821501
raise
14831502
except Exception as e:
14841503
logger.error(f"Error loading document {file_path}: {e}")
1485-
raise
1504+
raise

0 commit comments

Comments
 (0)