Sync #70


Merged: 14 commits merged on Jun 14, 2025

12 changes: 12 additions & 0 deletions pyproject.toml
@@ -45,6 +45,12 @@ dependencies = [

# --- Optional Dependencies ---
[project.optional-dependencies]
coding_map = [
"grep_ast",
"pygments",
"tqdm",

]
chunker = [
"langdetect",
"nltk",
@@ -54,6 +60,12 @@ chunker = [
"tqdm",
"jieba",
]
embeddings_rag = [
"torch",
"numpy",
"pydantic",
"transformers",
]
websearch = [
"lxml",
"bs4",#beautifulsoup
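The two new extras, coding_map and embeddings_rag, keep heavyweight packages such as torch and transformers out of the base install. For context only (not part of this diff), a minimal sketch of how code that relies on an extra typically guards its imports; the helper name, error message, and the pip install target are illustrative assumptions:

def require_embeddings_rag():
    """Import the optional embeddings/RAG stack, failing with an actionable hint."""
    try:
        import torch          # provided by the embeddings_rag extra
        import transformers   # provided by the embeddings_rag extra
    except ImportError as exc:
        raise ImportError(
            "Embeddings/RAG support requires the optional dependencies: "
            "pip install 'tldw_chatbook[embeddings_rag]'"
        ) from exc
    return torch, transformers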
5 changes: 4 additions & 1 deletion tldw_chatbook/Character_Chat/Character_Chat_Lib.py
@@ -18,6 +18,9 @@
from PIL import Image # For image processing
from loguru import logger
# from PIL.Image import Image as PILImage # More specific for type hints if needed

# Configure logger with context
logger = logger.bind(module="Character_Chat_Lib")
#
# Local Imports
from tldw_chatbook.DB.ChaChaNotes_DB import CharactersRAGDB, CharactersRAGDBError, ConflictError, InputError
@@ -2583,4 +2586,4 @@ def find_messages_in_conversation(
# End of File
########################################################################################################################
def load_character_card_from_file(param):
return None
return None
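The modules touched by this PR now rebind their module-level logger with a module field via loguru's bind(). For context, a minimal sketch of how that bound field can be surfaced in log output; the sink configuration shown here is illustrative and not something this PR adds:

import sys
from loguru import logger

# Provide a default so records without a bound "module" still format cleanly.
logger.configure(extra={"module": "app"})
logger.remove()
logger.add(
    sys.stderr,
    format="{time:HH:mm:ss} | {level} | {extra[module]} | {message}",
)

log = logger.bind(module="Character_Chat_Lib")  # same pattern as in this diff
log.info("character card loaded")  # ... | INFO | Character_Chat_Lib | character card loaded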
31 changes: 17 additions & 14 deletions tldw_chatbook/Chat/Chat_Functions.py
@@ -32,6 +32,9 @@
import requests
from pydantic import BaseModel, Field

# Configure logger with context
logger = logger.bind(module="Chat_Functions")

#
# Local Imports
from .Chat_Deps import ChatBadRequestError, ChatConfigurationError, ChatAPIError, \
@@ -68,7 +71,7 @@ def approximate_token_count(history):
total_tokens = len(total_text.split())
return total_tokens
except Exception as e:
logging.error(f"Error calculating token count: {str(e)}")
logger.error(f"Error calculating token count: {str(e)}")
return 0

# FIXME - Validate below
@@ -570,13 +573,13 @@ def chat_api_call(
requests.exceptions.RequestException: For network errors during the request.
"""
endpoint_lower = api_endpoint.lower()
logging.info(f"Chat API Call - Routing to endpoint: {endpoint_lower}")
logger.info(f"Chat API Call - Routing to endpoint: {endpoint_lower}")
log_counter("chat_api_call_attempt", labels={"api_endpoint": endpoint_lower})
start_time = time.time()

handler = API_CALL_HANDLERS.get(endpoint_lower)
if not handler:
logging.error(f"Unsupported API endpoint requested: {api_endpoint}")
logger.error(f"Unsupported API endpoint requested: {api_endpoint}")
raise ValueError(f"Unsupported API endpoint: {api_endpoint}")

params_map = PROVIDER_PARAM_MAP.get(endpoint_lower, {})
@@ -620,22 +623,22 @@ def chat_api_call(
pass # Specific handling for Cohere's prompt is assumed to be within chat_with_cohere

if call_kwargs.get(params_map.get('api_key', 'api_key')) and isinstance(call_kwargs.get(params_map.get('api_key', 'api_key')), str) and len(call_kwargs.get(params_map.get('api_key', 'api_key'))) > 8:
logging.info(f"Debug - Chat API Call - API Key: {call_kwargs[params_map.get('api_key', 'api_key')][:4]}...{call_kwargs[params_map.get('api_key', 'api_key')][-4:]}")
logger.info(f"Debug - Chat API Call - API Key: {call_kwargs[params_map.get('api_key', 'api_key')][:4]}...{call_kwargs[params_map.get('api_key', 'api_key')][-4:]}")

try:
logging.debug(f"Calling handler {handler.__name__} with kwargs: { {k: (type(v) if k != params_map.get('api_key') else 'key_hidden') for k,v in call_kwargs.items()} }")
logger.debug(f"Calling handler {handler.__name__} with kwargs: { {k: (type(v) if k != params_map.get('api_key') else 'key_hidden') for k,v in call_kwargs.items()} }")
response = handler(**call_kwargs)

call_duration = time.time() - start_time
log_histogram("chat_api_call_duration", call_duration, labels={"api_endpoint": endpoint_lower})
log_counter("chat_api_call_success", labels={"api_endpoint": endpoint_lower})

if isinstance(response, str):
logging.debug(f"Debug - Chat API Call - Response (first 500 chars): {response[:500]}...")
logger.debug(f"Debug - Chat API Call - Response (first 500 chars): {response[:500]}...")
elif hasattr(response, '__iter__') and not isinstance(response, (str, bytes, dict)):
logging.debug(f"Debug - Chat API Call - Response: Streaming Generator")
logger.debug(f"Debug - Chat API Call - Response: Streaming Generator")
else:
logging.debug(f"Debug - Chat API Call - Response Type: {type(response)}")
logger.debug(f"Debug - Chat API Call - Response Type: {type(response)}")
return response

# --- Exception Mapping (copied from your original, ensure it's still relevant) ---
@@ -646,9 +649,9 @@ def chat_api_call(

# Log safely first
try:
logging.error("%s. Details: %s", log_message_base, error_text[:500], exc_info=False)
logger.error("%s. Details: %s", log_message_base, error_text[:500], exc_info=False)
except Exception as log_e:
logging.error(f"Error during logging HTTPError details: {log_e}")
logger.error(f"Error during logging HTTPError details: {log_e}")

detail_message = f"API call to {endpoint_lower} failed with status {status_code}. Response: {error_text[:200]}"
if status_code == 401:
@@ -669,26 +672,26 @@ def chat_api_call(
message=f"Unexpected HTTP status {status_code} from {endpoint_lower}. Detail: {error_text[:200]}",
status_code=status_code)
except requests.exceptions.RequestException as e:
logging.error(f"Network error connecting to {endpoint_lower}: {e}", exc_info=False)
logger.error(f"Network error connecting to {endpoint_lower}: {e}", exc_info=False)
raise ChatProviderError(provider=endpoint_lower, message=f"Network error: {e}", status_code=504)
except (ChatAuthenticationError, ChatRateLimitError, ChatBadRequestError, ChatConfigurationError, ChatProviderError,
ChatAPIError) as e_chat_direct:
# This catches cases where the handler itself has already processed an error
# (e.g. non-HTTP error, or it decided to raise a specific Chat*Error type)
# and raises one of our custom exceptions.
logging.error(
logger.error(
f"Handler for {endpoint_lower} directly raised: {type(e_chat_direct).__name__} - {e_chat_direct.message}",
exc_info=True if e_chat_direct.status_code >= 500 else False)
raise e_chat_direct # Re-raise the specific error
except (ValueError, TypeError, KeyError) as e:
logging.error(f"Value/Type/Key error during chat API call setup for {endpoint_lower}: {e}", exc_info=True)
logger.error(f"Value/Type/Key error during chat API call setup for {endpoint_lower}: {e}", exc_info=True)
error_type = "Configuration/Parameter Error"
if "Unsupported API endpoint" in str(e):
raise ChatConfigurationError(provider=endpoint_lower, message=f"Unsupported API endpoint: {endpoint_lower}")
else:
raise ChatBadRequestError(provider=endpoint_lower, message=f"{error_type} for {endpoint_lower}: {e}")
except Exception as e:
logging.exception(
logger.exception(
f"Unexpected internal error in chat_api_call for {endpoint_lower}: {e}")
raise ChatAPIError(provider=endpoint_lower,
message=f"An unexpected internal error occurred in chat_api_call for {endpoint_lower}: {str(e)}",
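The logging changes above sit inside chat_api_call's table-driven dispatch: the lowercased endpoint name selects a handler from API_CALL_HANDLERS, and PROVIDER_PARAM_MAP renames the generic arguments to whatever that handler expects. A minimal sketch of that pattern, with a hypothetical handler and parameter names standing in for the real tables:

from typing import Any, Callable, Dict

def chat_with_openai(api_key: str, input_data: list, model: str) -> str:  # hypothetical handler
    return f"openai({model}): {len(input_data)} messages"

API_CALL_HANDLERS: Dict[str, Callable[..., Any]] = {"openai": chat_with_openai}

# Maps generic argument names to the names each provider's handler expects.
PROVIDER_PARAM_MAP: Dict[str, Dict[str, str]] = {
    "openai": {"api_key": "api_key", "messages_payload": "input_data", "model": "model"},
}

def dispatch(api_endpoint: str, **generic_kwargs: Any) -> Any:
    endpoint = api_endpoint.lower()
    handler = API_CALL_HANDLERS.get(endpoint)
    if handler is None:
        raise ValueError(f"Unsupported API endpoint: {api_endpoint}")
    params_map = PROVIDER_PARAM_MAP.get(endpoint, {})
    call_kwargs = {params_map.get(k, k): v for k, v in generic_kwargs.items()}
    return handler(**call_kwargs)

print(dispatch("OpenAI", api_key="sk-...", messages_payload=[{"role": "user", "content": "hi"}], model="gpt-4o-mini"))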
39 changes: 29 additions & 10 deletions tldw_chatbook/Chunking/Chunk_Lib.py
@@ -325,13 +325,14 @@ def chunk_text(self,


def _chunk_text_by_words(self, text: str, max_words: int, overlap: int, language: str) -> List[str]:
logger.debug(f"Chunking by words: max_words={max_words}, overlap={overlap}, language='{language}'")
logger.info(f"Chunking by words: max_words={max_words}, overlap={overlap}, language='{language}'")
# Language-specific word tokenization
words: List[str]
if language.startswith('zh'): # Chinese
try:
import jieba
words = list(jieba.cut(text))
logger.debug(f"Using jieba for Chinese word tokenization, found {len(words)} words")
except ImportError:
logger.warning("jieba library not found for Chinese word tokenization. Falling back to space splitting.")
words = text.split()
@@ -371,11 +372,13 @@ def _chunk_text_by_words(self, text: str, max_words: int, overlap: int, language
chunks.append(' '.join(chunk_words))
logger.debug(f"Created word chunk {len(chunks)} with {len(chunk_words)} words")

return self._post_process_chunks(chunks)
processed_chunks = self._post_process_chunks(chunks)
logger.info(f"Word chunking complete: created {len(processed_chunks)} chunks from {len(words)} words")
return processed_chunks


def _chunk_text_by_sentences(self, text: str, max_sentences: int, overlap: int, language: str) -> List[str]:
logger.debug(f"Chunking by sentences: max_sentences={max_sentences}, overlap={overlap}, lang='{language}'")
logger.info(f"Chunking by sentences: max_sentences={max_sentences}, overlap={overlap}, lang='{language}'")
sentences: List[str]

if language.startswith('zh'):
@@ -445,11 +448,15 @@ def _chunk_text_by_sentences(self, text: str, max_sentences: int, overlap: int,
for i in range(0, len(sentences), step):
chunk_sentences = sentences[i : i + max_sentences]
chunks.append(' '.join(chunk_sentences))
return self._post_process_chunks(chunks)
logger.debug(f"Created sentence chunk {len(chunks)} with {len(chunk_sentences)} sentences")

processed_chunks = self._post_process_chunks(chunks)
logger.info(f"Sentence chunking complete: created {len(processed_chunks)} chunks from {len(sentences)} sentences")
return processed_chunks


def _chunk_text_by_paragraphs(self, text: str, max_paragraphs: int, overlap: int) -> List[str]:
logger.debug(f"Chunking by paragraphs: max_paragraphs={max_paragraphs}, overlap={overlap}")
logger.info(f"Chunking by paragraphs: max_paragraphs={max_paragraphs}, overlap={overlap}")
# Split by one or more empty lines (common paragraph delimiter)
paragraphs = re.split(r'\n\s*\n+', text)
paragraphs = [p.strip() for p in paragraphs if p.strip()] # Remove empty paragraphs
@@ -470,7 +477,11 @@ def _chunk_text_by_paragraphs(self, text: str, max_paragraphs: int, overlap: int
for i in range(0, len(paragraphs), step):
chunk_paragraphs = paragraphs[i : i + max_paragraphs]
chunks.append('\n\n'.join(chunk_paragraphs)) # Join with double newline to preserve paragraph structure
return self._post_process_chunks(chunks) # post_process_chunks strips leading/trailing, which is fine
logger.debug(f"Created paragraph chunk {len(chunks)} with {len(chunk_paragraphs)} paragraphs")

processed_chunks = self._post_process_chunks(chunks) # post_process_chunks strips leading/trailing, which is fine
logger.info(f"Paragraph chunking complete: created {len(processed_chunks)} chunks from {len(paragraphs)} paragraphs")
return processed_chunks


def _chunk_text_by_tokens(self, text: str, max_tokens: int, overlap: int) -> List[str]:
Expand All @@ -479,7 +490,7 @@ def _chunk_text_by_tokens(self, text: str, max_tokens: int, overlap: int) -> Lis
logger.error("Tokenizer not available for token-based chunking.")
raise ChunkingError("Tokenizer not loaded, cannot use 'tokens' chunking method.")

logger.debug(f"Chunking by tokens: max_tokens={max_tokens}, overlap_tokens={overlap} (token overlap)")
logger.info(f"Chunking by tokens: max_tokens={max_tokens}, overlap_tokens={overlap} (token overlap)")
if max_tokens <= 0:
logger.warning("max_tokens must be positive. Returning single chunk or empty.")
return [text] if text.strip() else []
@@ -501,7 +512,11 @@ def _chunk_text_by_tokens(self, text: str, max_tokens: int, overlap: int) -> Lis
chunk_token_ids = tokens[i : i + max_tokens]
chunk_text = self.tokenizer.decode(chunk_token_ids, skip_special_tokens=True) # skip_special_tokens might be an option
chunks.append(chunk_text)
return self._post_process_chunks(chunks)
logger.debug(f"Created token chunk {len(chunks)} with {len(chunk_token_ids)} tokens")

processed_chunks = self._post_process_chunks(chunks)
logger.info(f"Token chunking complete: created {len(processed_chunks)} chunks from {len(tokens)} tokens")
return processed_chunks


# --- Adaptive Chunking Methods ---
@@ -1357,8 +1372,9 @@ def improved_chunking_process(text: str,
llm_call_function_for_chunker: Optional[Callable] = None,
llm_api_config_for_chunker: Optional[Dict[str, Any]] = None
) -> List[Dict[str, Any]]:
logger.debug("Improved chunking process started...")
logger.info("Improved chunking process started...")
logger.debug(f"Received chunk_options_dict: {chunk_options_dict}")
logger.debug(f"Text length: {len(text)} characters, tokenizer: {tokenizer_name_or_path}")

chunker_instance = Chunker(options=chunk_options_dict,
tokenizer_name_or_path=tokenizer_name_or_path,
@@ -1421,8 +1437,10 @@

chunks_with_metadata_list = []
total_chunks_count = len(raw_chunks)
logger.info(f"Processing {total_chunks_count} chunks for metadata enrichment")
try:
for i, chunk_item in enumerate(raw_chunks):
logger.debug(f"Processing chunk {i+1}/{total_chunks_count}")
actual_text_content: str
is_json_chunk = False
chunk_specific_metadata = {} # Initialize
@@ -1464,6 +1482,7 @@
})

logger.debug(f"Successfully created metadata for all {len(chunks_with_metadata_list)} chunks")
logger.info(f"Improved chunking process completed: {len(chunks_with_metadata_list)} chunks created using method '{effective_options['method']}', language: {effective_options.get('language', 'unknown')}")
return chunks_with_metadata_list
except Exception as e:
logger.error(f"Error creating chunk metadata: {e}", exc_info=True)
@@ -1482,4 +1501,4 @@ def load_document(file_path: str) -> str:
raise
except Exception as e:
logger.error(f"Error loading document {file_path}: {e}")
raise
raise
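The new per-chunk logging in the _chunk_text_by_* methods all traces the same loop shape: a window of max_size units (words, sentences, paragraphs, or token ids) advancing by max_size - overlap each iteration. A standalone sketch of that sliding window, with illustrative parameter values:

from typing import List

def sliding_window_chunks(units: List[str], max_size: int, overlap: int, joiner: str = " ") -> List[str]:
    """Group units into overlapping chunks, mirroring the step logic used in the chunkers."""
    if max_size <= 0:
        return [joiner.join(units)] if units else []
    step = max(1, max_size - max(0, overlap))  # guard: overlap >= max_size would otherwise stall
    chunks = []
    for i in range(0, len(units), step):
        window = units[i : i + max_size]
        chunks.append(joiner.join(window))
    return chunks

words = "the quick brown fox jumps over the lazy dog".split()
print(sliding_window_chunks(words, max_size=4, overlap=1))
# ['the quick brown fox', 'fox jumps over the', 'the lazy dog']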