Sync #65

Merged: 32 commits, merged Jun 10, 2025

Commits (the diff shown below is from a single commit of the 32):
fdda8d7 - Media display (rmusser01, Jun 7, 2025)
5a5c6b2 - media search isn't broken anymore (rmusser01, Jun 7, 2025)
fff4a9f - and that fixes the pytest req (rmusser01, Jun 7, 2025)
a2d3b62 - bugfix (rmusser01, Jun 8, 2025)
9991c7d - media viewing (rmusser01, Jun 8, 2025)
3345cdd - Update .gitignore (rmusser01, Jun 8, 2025)
e974295 - Media window refactor (rmusser01, Jun 8, 2025)
5acb244 - new media view + collapse button (rmusser01, Jun 8, 2025)
c107bdb - f (rmusser01, Jun 8, 2025)
9784e99 - progress (rmusser01, Jun 8, 2025)
2d5906d - progress (rmusser01, Jun 8, 2025)
aa511ae - cleanup (rmusser01, Jun 8, 2025)
f62f4ed - fix, but weird bug... (rmusser01, Jun 8, 2025)
2fa2a9c - fml (rmusser01, Jun 8, 2025)
9f12dbb - rewrote chunking lib to support new logging library (rmusser01, Jun 8, 2025)
53cbe35 - fix chunking and summarization logging (rmusser01, Jun 8, 2025)
99c2d69 - webscraper (rmusser01, Jun 8, 2025)
5baeabe - and bug fixed. (rmusser01, Jun 8, 2025)
bf74a65 - scraper (rmusser01, Jun 8, 2025)
7af882b - eh (rmusser01, Jun 9, 2025)
6115894 - progress (rmusser01, Jun 9, 2025)
c0650f8 - media search in chat works (rmusser01, Jun 9, 2025)
f69eb89 - media search bugs (rmusser01, Jun 9, 2025)
5f0fddd - Update Client_Media_DB_v2.py (rmusser01, Jun 9, 2025)
c91c227 - Update app.py (rmusser01, Jun 9, 2025)
f01f47b - Update chat_right_sidebar.py (rmusser01, Jun 9, 2025)
bf0efe4 - Update chat_right_sidebar.py (rmusser01, Jun 9, 2025)
99f7b50 - Update chat_right_sidebar.py (rmusser01, Jun 9, 2025)
ab15775 - Update chat_right_sidebar.py (rmusser01, Jun 9, 2025)
e4b3ad7 - CSS and check for on-load (rmusser01, Jun 10, 2025)
07011fd - Update tldw_cli.tcss (rmusser01, Jun 10, 2025)
c0bc64c - Update tldw_cli.tcss (rmusser01, Jun 10, 2025)

Commit 99c2d69491f23338e98bbe3bd93b76e5f1849d0a - webscraper
rmusser01 committed Jun 8, 2025

15 changes: 8 additions & 7 deletions pyproject.toml
@@ -45,9 +45,11 @@ dependencies = [

# --- Optional Dependencies ---
[project.optional-dependencies]
vllm = ["vllm"]
mlx = ["mlx-lm"]
transformers = ["transformers"]
chunker = [
"langdetect",
"nltk",
"scikit-learn",
]
websearch = [
"lxml",
"bs4",#beautifulsoup
@@ -58,11 +60,10 @@ websearch = [
"langdetect",
"nltk",
"scikit-learn",

]
# For groups like local_ingestion, local_rag, etc., if they have actual packages:
# local_ingestion = ["some-ingestion-package"]
# local_rag = ["some-rag-package"]
local_vllm = ["vllm"]
local_mlx = ["mlx-lm"]
local_transformers = ["transformers"]
dev = [ # Example for development dependencies
"pytest",
"textual-dev", # For Textual development tools
(New empty file; filename not shown in this view.)
42 changes: 42 additions & 0 deletions tldw_chatbook/Web_Scraping/Article_Scraper/config.py
@@ -0,0 +1,42 @@
# article_scraper/config.py
#
# Imports
from dataclasses import dataclass, field
from typing import List, Dict, Any
#
# Third-Party Imports
#
# Imports
#
#######################################################################################################################
#
# Functions:

@dataclass
class ScraperConfig:
    user_agent: str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
    request_timeout_ms: int = 60000  # 60 seconds
    retries: int = 3
    stealth: bool = True
    # Time to wait after page load if stealth is enabled
    stealth_wait_ms: int = 5000

    # Trafilatura settings
    include_comments: bool = False
    include_tables: bool = False
    include_images: bool = False


@dataclass
class ProcessorConfig:
    api_name: str
    api_key: str
    summarize: bool = False
    custom_prompt: str = "Please provide a concise summary of the following article."
    system_message: str = "You are an expert summarization assistant."
    temperature: float = 0.7
    keywords: List[str] = field(default_factory=list)

#
# End of article_scraper/config.py
#######################################################################################################################
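
A minimal construction sketch for these dataclasses (assuming the import path mirrors this file's location in the repo; the provider name and key are placeholders, not values from the source):

from tldw_chatbook.Web_Scraping.Article_Scraper.config import ScraperConfig, ProcessorConfig

# ScraperConfig: every field has a default, so override only what differs.
scraper_cfg = ScraperConfig(stealth=False, request_timeout_ms=30_000)

# ProcessorConfig: api_name and api_key have no defaults and must be passed.
processor_cfg = ProcessorConfig(
    api_name="openai",   # hypothetical provider name
    api_key="sk-...",    # placeholder, not a real key
    summarize=True,
    keywords=["web scraping", "summarization"],
)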
(Four new empty files; filenames not shown in this view.)
125 changes: 125 additions & 0 deletions tldw_chatbook/Web_Scraping/Article_Scraper/utils.py
@@ -0,0 +1,125 @@
# article_scraper/utils.py
#
# Imports
import hashlib
import json
from datetime import datetime
from typing import Any, Dict, Optional, Tuple
#
# Third-Party Libraries
#
# Local Imports
#
#######################################################################################################################
#
# Functions:

class ContentMetadataHandler:
    """Handles the addition and parsing of metadata for scraped content."""

    METADATA_START = "[METADATA]"
    METADATA_END = "[/METADATA]"

    @staticmethod
    def format_content_with_metadata(
        url: str,
        content: str,
        pipeline: str,
        additional_metadata: Optional[Dict[str, Any]] = None
    ) -> str:
        metadata = {
            "url": url,
            "ingestion_date": datetime.now().isoformat(),
            "content_hash": hashlib.sha256(content.encode('utf-8')).hexdigest(),
            "scraping_pipeline": pipeline,
            **(additional_metadata or {})
        }
        metadata_str = json.dumps(metadata, indent=2)
        return f"{ContentMetadataHandler.METADATA_START}\n{metadata_str}\n{ContentMetadataHandler.METADATA_END}\n\n{content}"
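
    # Example of the returned shape (illustrative values, not real output):
    #
    #   [METADATA]
    #   {
    #     "url": "https://example.com/post",
    #     "ingestion_date": "2025-06-08T12:00:00",
    #     "content_hash": "<sha256 hex digest of content>",
    #     "scraping_pipeline": "trafilatura"
    #   }
    #   [/METADATA]
    #
    #   <original article content>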

    @staticmethod
    def extract_metadata(content_with_meta: str) -> Tuple[Optional[Dict[str, Any]], str]:
        """Extracts metadata and returns (metadata_dict, clean_content)."""
        try:
            start_idx = content_with_meta.index(ContentMetadataHandler.METADATA_START)
            end_idx = content_with_meta.index(ContentMetadataHandler.METADATA_END)

            metadata_str = content_with_meta[start_idx + len(ContentMetadataHandler.METADATA_START):end_idx].strip()
            metadata = json.loads(metadata_str)

            clean_content = content_with_meta[end_idx + len(ContentMetadataHandler.METADATA_END):].strip()

            return metadata, clean_content
        except (ValueError, json.JSONDecodeError):
            return None, content_with_meta

    # ... other methods from the original class are good ...


    @staticmethod
    def has_metadata(content: str) -> bool:
        """
        Check if content contains metadata.

        Args:
            content: The content to check

        Returns:
            bool: True if metadata is present
        """
        return (ContentMetadataHandler.METADATA_START in content and
                ContentMetadataHandler.METADATA_END in content)

    @staticmethod
    def strip_metadata(content: str) -> str:
        """
        Remove metadata from content if present.

        Args:
            content: The content to strip metadata from

        Returns:
            Content without metadata
        """
        try:
            metadata_end = content.index(ContentMetadataHandler.METADATA_END)
            return content[metadata_end + len(ContentMetadataHandler.METADATA_END):].strip()
        except ValueError:
            return content

    @staticmethod
    def get_content_hash(content: str) -> str:
        """
        Get hash of content without metadata.

        Args:
            content: The content to hash

        Returns:
            SHA-256 hash of the clean content
        """
        clean_content = ContentMetadataHandler.strip_metadata(content)
        return hashlib.sha256(clean_content.encode('utf-8')).hexdigest()

    @staticmethod
    def content_changed(old_content: str, new_content: str) -> bool:
        """
        Check if content has changed by comparing hashes.

        Args:
            old_content: Previous version of content
            new_content: New version of content

        Returns:
            bool: True if content has changed
        """
        old_hash = ContentMetadataHandler.get_content_hash(old_content)
        new_hash = ContentMetadataHandler.get_content_hash(new_content)
        return old_hash != new_hash



def convert_html_to_markdown(html: str) -> str:
    """A simple HTML-to-text converter: extracts visible text via BeautifulSoup (no Markdown markup is produced)."""
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    return soup.get_text(separator='\n\n').strip()
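
A quick round-trip sketch of the utilities above (the URL, pipeline label, and content are placeholders, not values from the source):

text = convert_html_to_markdown("<p>Hello</p><p>World</p>")   # "Hello\n\nWorld"

wrapped = ContentMetadataHandler.format_content_with_metadata(
    url="https://example.com/hello",   # placeholder URL
    content=text,
    pipeline="demo",                   # placeholder pipeline label
)
assert ContentMetadataHandler.has_metadata(wrapped)

metadata, clean = ContentMetadataHandler.extract_metadata(wrapped)
assert clean == text
assert metadata["scraping_pipeline"] == "demo"

# content_changed hashes metadata-stripped content, so the wrapped and
# bare forms of the same text compare as unchanged.
assert not ContentMetadataHandler.content_changed(wrapped, text)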