Skip to content

Commit c94557a

Browse files
authored (author name missing from page capture)
Merge pull request #67 from rmusser01/dev
Make the chunking-related lib reqs imported only if available
2 parents 7b59218 + 9d8344b commit c94557a

File tree

2 files changed: +91 additions, −81 deletions

pyproject.toml

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,12 +35,12 @@ dependencies = [
3535
"loguru",
3636
"textual>=3.3.0",
3737
"requests",
38-
"rich",
39-
"Pillow",
38+
"rich",
39+
"pillow",
4040
"PyYAML",
4141
"pydantic",
4242
"psutil",
43-
# "toml", # Only if writing TOML. For reading in Python 3.11+, tomllib is built-in.
43+
"toml",
4444
]
4545

4646
# --- Optional Dependencies ---
@@ -49,6 +49,10 @@ chunker = [
4949
"langdetect",
5050
"nltk",
5151
"scikit-learn",
52+
"fugashi",
53+
"transformers",
54+
"tqdm",
55+
"jieba",
5256
]
5357
websearch = [
5458
"lxml",

tldw_chatbook/LLM_Calls/Summarization_General_Lib.py

Lines changed: 84 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,12 @@
2323
from typing import Optional, Union, Generator, Any, Dict, List, Callable
2424
#
2525
# 3rd-Party Imports
26+
from loguru import logger
2627
import requests
2728
from requests.adapters import HTTPAdapter
2829
from urllib3 import Retry
2930
#
3031
# Import Local
31-
from tldw_chatbook.Chunking.Chunk_Lib import (
32-
improved_chunking_process
33-
)
3432
from tldw_chatbook.LLM_Calls.Local_Summarization_Lib import (
3533
summarize_with_llama,
3634
summarize_with_kobold,
@@ -44,7 +42,14 @@
4442
)
4543
from tldw_chatbook.Logging_Config import logging
4644
from tldw_chatbook.config import get_cli_setting
47-
45+
try:
46+
from tldw_chatbook.Chunking.Chunk_Lib import (
47+
improved_chunking_process
48+
)
49+
CHUNKER_AVAILABLE = True
50+
except ImportError:
51+
logger.warning("Failed to import chunking library. Will not be available.")
52+
CHUNKER_AVAILABLE = False
4853

4954
# FIXME
5055
def load_and_log_configs():
@@ -418,82 +423,83 @@ def consume_generator(gen):
418423
default_chunk_opts = {'method': 'sentences', 'max_size': 500, 'overlap': 200}
419424
current_chunk_options = chunk_options if isinstance(chunk_options, dict) else default_chunk_opts
420425

421-
if recursive_summarization:
422-
logging.info("Performing recursive summarization.")
423-
chunks_data = improved_chunking_process(text_content, current_chunk_options) # Renamed variable for clarity
424-
if not chunks_data:
425-
logging.warning("Recursive summarization: Chunking produced no chunks.")
426-
return "Error: Recursive summarization failed - no chunks generated."
427-
428-
# Extract just the text from the chunk data
429-
text_chunks = [chunk['text'] for chunk in chunks_data]
430-
logging.debug(f"Generated {len(text_chunks)} text chunks for recursive summarization.")
431-
432-
# Define the summarizer function for recursive_summarize_chunks
433-
# It must accept ONE argument (the text) and return the summary string.
434-
# It captures necessary variables (api_name, key, temp, prompts, etc.) from the outer scope (closure).
435-
# It must handle potential errors from the API call and return an error string if needed.
436-
def recursive_step_processor(text_to_summarize: str) -> str:
437-
logging.debug(f"recursive_step_processor called with text length: {len(text_to_summarize)}")
438-
# Force non-streaming for internal steps and consume immediately
439-
api_result = _dispatch_to_api(
440-
text_to_summarize,
441-
custom_prompt_arg, # Custom prompt is handled by _dispatch_to_api
442-
api_name,
443-
api_key,
444-
temp,
445-
system_message, # System message is handled by _dispatch_to_api
446-
streaming=False # IMPORTANT: Force non-streaming for internal recursive steps
426+
if CHUNKER_AVAILABLE == True:
427+
if recursive_summarization:
428+
logging.info("Performing recursive summarization.")
429+
chunks_data = improved_chunking_process(text_content, current_chunk_options) # Renamed variable for clarity
430+
if not chunks_data:
431+
logging.warning("Recursive summarization: Chunking produced no chunks.")
432+
return "Error: Recursive summarization failed - no chunks generated."
433+
434+
# Extract just the text from the chunk data
435+
text_chunks = [chunk['text'] for chunk in chunks_data]
436+
logging.debug(f"Generated {len(text_chunks)} text chunks for recursive summarization.")
437+
438+
# Define the summarizer function for recursive_summarize_chunks
439+
# It must accept ONE argument (the text) and return the summary string.
440+
# It captures necessary variables (api_name, key, temp, prompts, etc.) from the outer scope (closure).
441+
# It must handle potential errors from the API call and return an error string if needed.
442+
def recursive_step_processor(text_to_summarize: str) -> str:
443+
logging.debug(f"recursive_step_processor called with text length: {len(text_to_summarize)}")
444+
# Force non-streaming for internal steps and consume immediately
445+
api_result = _dispatch_to_api(
446+
text_to_summarize,
447+
custom_prompt_arg, # Custom prompt is handled by _dispatch_to_api
448+
api_name,
449+
api_key,
450+
temp,
451+
system_message, # System message is handled by _dispatch_to_api
452+
streaming=False # IMPORTANT: Force non-streaming for internal recursive steps
453+
)
454+
# consume_generator handles both strings and generators, returning a string
455+
processed_result = consume_generator(api_result)
456+
457+
# Ensure the result is a string (consume_generator should do this)
458+
if not isinstance(processed_result, str):
459+
logging.error(f"API dispatch/consumption did not return a string. Got: {type(processed_result)}")
460+
# Return an error string that recursive_summarize_chunks can detect
461+
return f"Error: Internal summarization step failed to produce string output (got {type(processed_result)})"
462+
463+
logging.debug(f"recursive_step_processor finished. Result length: {len(processed_result)}")
464+
# Return the result string (which could be a summary or an error message from consume_generator)
465+
return processed_result
466+
467+
# Call the simplified recursive_summarize_chunks utility
468+
# It now only needs the list of text chunks and the processing function
469+
final_result = recursive_summarize_chunks(
470+
chunks=text_chunks,
471+
summarize_func=recursive_step_processor
447472
)
448-
# consume_generator handles both strings and generators, returning a string
449-
processed_result = consume_generator(api_result)
450-
451-
# Ensure the result is a string (consume_generator should do this)
452-
if not isinstance(processed_result, str):
453-
logging.error(f"API dispatch/consumption did not return a string. Got: {type(processed_result)}")
454-
# Return an error string that recursive_summarize_chunks can detect
455-
return f"Error: Internal summarization step failed to produce string output (got {type(processed_result)})"
456-
457-
logging.debug(f"recursive_step_processor finished. Result length: {len(processed_result)}")
458-
# Return the result string (which could be a summary or an error message from consume_generator)
459-
return processed_result
460-
461-
# Call the simplified recursive_summarize_chunks utility
462-
# It now only needs the list of text chunks and the processing function
463-
final_result = recursive_summarize_chunks(
464-
chunks=text_chunks,
465-
summarize_func=recursive_step_processor
466-
)
467-
# The result of recursive_summarize_chunks is now the final string summary or an error string
468-
469-
elif chunked_summarization:
470-
logging.info("Performing chunked summarization (summarize each, then combine).")
471-
chunks = improved_chunking_process(text_content, current_chunk_options)
472-
if not chunks:
473-
logging.warning("Chunked summarization: Chunking produced no chunks.")
474-
return "Error: Chunked summarization failed - no chunks generated."
475-
logging.debug(f"Generated {len(chunks)} chunks for chunked summarization.")
476-
477-
chunk_summaries = []
478-
for i, chunk in enumerate(chunks):
479-
logging.debug(f"Summarizing chunk {i+1}/{len(chunks)}")
480-
# Summarize each chunk - force non-streaming for API call
481-
chunk_summary_result = _dispatch_to_api(
482-
chunk['text'], custom_prompt_arg, api_name, api_key,
483-
temp, system_message, streaming=False # Force non-streaming
484-
)
485-
# Consume generator immediately
486-
processed_chunk_summary = consume_generator(chunk_summary_result)
487-
488-
if isinstance(processed_chunk_summary, str) and not processed_chunk_summary.startswith("Error:"):
489-
chunk_summaries.append(processed_chunk_summary)
490-
else:
491-
error_detail = processed_chunk_summary if isinstance(processed_chunk_summary, str) else "Unknown error"
492-
logging.warning(f"Failed to summarize chunk {i+1}: {error_detail}")
493-
chunk_summaries.append(f"[Error summarizing chunk {i+1}: {error_detail}]") # Add error placeholder
473+
# The result of recursive_summarize_chunks is now the final string summary or an error string
474+
475+
elif chunked_summarization:
476+
logging.info("Performing chunked summarization (summarize each, then combine).")
477+
chunks = improved_chunking_process(text_content, current_chunk_options)
478+
if not chunks:
479+
logging.warning("Chunked summarization: Chunking produced no chunks.")
480+
return "Error: Chunked summarization failed - no chunks generated."
481+
logging.debug(f"Generated {len(chunks)} chunks for chunked summarization.")
482+
483+
chunk_summaries = []
484+
for i, chunk in enumerate(chunks):
485+
logging.debug(f"Summarizing chunk {i+1}/{len(chunks)}")
486+
# Summarize each chunk - force non-streaming for API call
487+
chunk_summary_result = _dispatch_to_api(
488+
chunk['text'], custom_prompt_arg, api_name, api_key,
489+
temp, system_message, streaming=False # Force non-streaming
490+
)
491+
# Consume generator immediately
492+
processed_chunk_summary = consume_generator(chunk_summary_result)
493+
494+
if isinstance(processed_chunk_summary, str) and not processed_chunk_summary.startswith("Error:"):
495+
chunk_summaries.append(processed_chunk_summary)
496+
else:
497+
error_detail = processed_chunk_summary if isinstance(processed_chunk_summary, str) else "Unknown error"
498+
logging.warning(f"Failed to summarize chunk {i+1}: {error_detail}")
499+
chunk_summaries.append(f"[Error summarizing chunk {i+1}: {error_detail}]") # Add error placeholder
494500

495-
# Combine the summaries
496-
final_result = "\n\n---\n\n".join(chunk_summaries) # Join with a separator
501+
# Combine the summaries
502+
final_result = "\n\n---\n\n".join(chunk_summaries) # Join with a separator
497503

498504
else:
499505
# No chunking - direct summarization

0 commit comments

Comments (0)