Commit d5dbf34

Migrate content from tldw
1 parent edcd56f commit d5dbf34

File tree

3 files changed: +523 -0 lines changed

Email_Processing.py

Lines changed: 145 additions & 0 deletions
@@ -0,0 +1,145 @@
# email_processing.py
#
# Functions for parsing emails, extracting metadata, calculating relevance
# scores, performing entity recognition, topic clustering, sentiment
# analysis, and keyword extraction.

import email
import re
from email.utils import parsedate_to_datetime, getaddresses
from typing import Dict, Any, List

import spacy
from bs4 import BeautifulSoup
from gensim import corpora
from gensim.models import LdaModel
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from textblob import TextBlob

nlp = spacy.load("en_core_web_sm")


def parse_email(email_content: str) -> Dict[str, Any]:
    msg = email.message_from_string(email_content)

    body = ""
    html_body = ""
    if msg.is_multipart():
        for part in msg.walk():
            payload = part.get_payload(decode=True)
            if payload is None:
                continue  # multipart containers carry no payload of their own
            if part.get_content_type() == "text/plain":
                body = payload.decode(errors="replace")
            elif part.get_content_type() == "text/html":
                html_body = payload.decode(errors="replace")
    else:
        payload = msg.get_payload(decode=True)
        if payload is not None:
            body = payload.decode(errors="replace")

    # Fall back to the stripped HTML body when no plain-text part exists.
    if not body and html_body:
        body = BeautifulSoup(html_body, "html.parser").get_text()

    return {
        'subject': msg['subject'],
        'from': msg['from'],
        'to': msg['to'],
        'cc': msg['cc'],
        'date': parsedate_to_datetime(msg['date']) if msg['date'] else None,
        'body': body,
        'message_id': msg['message-id'],
        'in_reply_to': msg['in-reply-to'],
        'references': msg['references']
    }


def extract_metadata(parsed_email: Dict[str, Any]) -> Dict[str, Any]:
    # getaddresses() cannot handle None, so guard every header that may be absent.
    metadata = {
        'subject': parsed_email['subject'],
        'from': getaddresses([parsed_email['from']]) if parsed_email['from'] else [],
        'to': getaddresses([parsed_email['to']]) if parsed_email['to'] else [],
        'cc': getaddresses([parsed_email['cc']]) if parsed_email['cc'] else [],
        'date': parsed_email['date'],
        'email_addresses': re.findall(r'[\w.-]+@[\w.-]+', parsed_email['body']),
        'urls': re.findall(r'https?://[^\s<>"\']+', parsed_email['body']),
        'message_id': parsed_email['message_id'],
        'in_reply_to': parsed_email['in_reply_to'],
        'references': parsed_email['references']
    }
    return metadata


def calculate_relevance_score(email_body: str, query: str) -> float:
    # Cosine similarity between the TF-IDF vectors of the body and the query.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([email_body, query])
    cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    return float(cosine_sim[0][0])


def extract_entities(text: str) -> Dict[str, list]:
    doc = nlp(text)
    entities = {
        'PERSON': [],
        'ORG': [],
        'GPE': [],  # Geopolitical Entity
        'DATE': [],
        'MONEY': []
    }
    for ent in doc.ents:
        if ent.label_ in entities:
            entities[ent.label_].append(ent.text)
    return entities


def perform_topic_clustering(email_bodies: List[str], num_topics: int = 5) -> List[Dict[str, Any]]:
    def preprocess(text):
        return [token for token in simple_preprocess(text) if token not in STOPWORDS]

    processed_docs = [preprocess(doc) for doc in email_bodies]
    dictionary = corpora.Dictionary(processed_docs)
    corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=100,
                         update_every=1, chunksize=100, passes=10, alpha='auto', per_word_topics=True)

    # show_topics(formatted=False) returns (topic_id, [(word, probability), ...])
    # pairs directly, avoiding fragile re-parsing of the formatted strings.
    topics = lda_model.show_topics(num_topics=num_topics, num_words=10, formatted=False)
    return [{'id': topic_id, 'words': {word: float(prob) for word, prob in words}}
            for topic_id, words in topics]


def analyze_email_thread(emails: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    thread_map = {}
    for msg in emails:  # named `msg` to avoid shadowing the stdlib `email` module
        message_id = msg['metadata']['message_id']
        in_reply_to = msg['metadata']['in_reply_to']
        if in_reply_to:
            if in_reply_to in thread_map:
                thread_map[in_reply_to]['replies'].append(msg)
            else:
                thread_map[in_reply_to] = {'email': None, 'replies': [msg]}
        if message_id:
            if message_id in thread_map:
                thread_map[message_id]['email'] = msg
            else:
                thread_map[message_id] = {'email': msg, 'replies': []}

    # Thread roots are messages that reply to nothing; skip entries whose
    # root message was only ever referenced, never actually seen.
    threads = []
    for message_id, thread_info in thread_map.items():
        root = thread_info['email']
        if root is not None and not root['metadata']['in_reply_to']:
            threads.append(thread_info)

    return threads


def perform_sentiment_analysis(text: str) -> Dict[str, float]:
    blob = TextBlob(text)
    return {
        'polarity': blob.sentiment.polarity,
        'subjectivity': blob.sentiment.subjectivity
    }


def extract_keywords(text: str, num_keywords: int = 10) -> List[str]:
    doc = nlp(text)
    keywords = []
    for token in doc:
        if not token.is_stop and not token.is_punct and token.pos_ in ['NOUN', 'PROPN', 'ADJ']:
            keywords.append(token.text.lower())
    # dict.fromkeys deduplicates while keeping first-seen order; set() would
    # make the truncated result nondeterministic.
    return list(dict.fromkeys(keywords))[:num_keywords]
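
A minimal usage sketch (editor's illustration, not part of the commit) showing how the functions above chain together; the raw message and its contents are made up:

# Usage sketch (illustrative only; the message below is made up).
raw = (
    "From: alice@example.com\n"
    "To: bob@example.com\n"
    "Subject: Q3 budget\n"
    "Date: Mon, 01 Jul 2024 09:00:00 +0000\n"
    "Message-ID: <1@example.com>\n"
    "\n"
    "Hi Bob, Acme Corp approved $50,000 for Q3. See https://example.com/budget\n"
)
parsed = parse_email(raw)
meta = extract_metadata(parsed)
print(meta['urls'])                                       # ['https://example.com/budget']
print(calculate_relevance_score(parsed['body'], "Q3 budget approval"))
print(extract_entities(parsed['body']))                   # model-dependent, e.g. ORG/MONEY hits
print(perform_sentiment_analysis(parsed['body']))         # polarity in [-1, 1], subjectivity in [0, 1]
print(extract_keywords(parsed['body']))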

Email_RAG.py

Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
# rag_system.py

import sqlite3
from multiprocessing import Pool, cpu_count
from typing import List, Dict, Any, Tuple

import numpy as np
from sentence_transformers import SentenceTransformer

# Initialize the embedding model; all-MiniLM-L6-v2 produces 384-dimensional
# vectors, matching the vss0 table declaration below.
model = SentenceTransformer('all-MiniLM-L6-v2')

# SQLite database setup
DB_PATH = 'email_analysis.db'


def _load_vss(conn: sqlite3.Connection) -> None:
    # Loadable extensions are per-connection, so every connection that touches
    # the vss0 virtual table must load sqlite-vss first. Depending on how
    # sqlite-vss is installed, the sqlite_vss Python package's load() helper
    # may be needed instead of a bare load_extension() call.
    conn.enable_load_extension(True)
    conn.load_extension("sqlite-vss")


def init_db():
    with sqlite3.connect(DB_PATH) as conn:
        _load_vss(conn)
        conn.execute('''
            CREATE TABLE IF NOT EXISTS email_embeddings (
                id INTEGER PRIMARY KEY,
                email_id TEXT UNIQUE,
                embedding BLOB
            )
        ''')
        conn.execute('CREATE VIRTUAL TABLE IF NOT EXISTS vss_email_embeddings USING vss0(embedding(384))')
        conn.execute('CREATE INDEX IF NOT EXISTS idx_email_id ON email_embeddings(email_id)')


def embed_single_email(email: Dict[str, Any]) -> Tuple[str, np.ndarray]:
    embedding = model.encode(email['parsed_email']['body'])
    return (email['parsed_email']['message_id'], embedding)


def embed_emails_batch(emails: List[Dict[str, Any]], batch_size: int = 100):
    # Each worker process re-imports this module and loads its own copy of
    # the model, trading memory for embedding throughput.
    with Pool(processes=cpu_count()) as pool:
        results = []
        for i in range(0, len(emails), batch_size):
            batch = emails[i:i + batch_size]
            results.extend(pool.map(embed_single_email, batch))

    with sqlite3.connect(DB_PATH) as conn:
        _load_vss(conn)
        conn.executemany('INSERT OR REPLACE INTO email_embeddings (email_id, embedding) VALUES (?, ?)',
                         [(email_id, embedding.tobytes()) for email_id, embedding in results])
        conn.executemany(
            'INSERT OR REPLACE INTO vss_email_embeddings (rowid, embedding) VALUES ((SELECT id FROM email_embeddings WHERE email_id = ?), ?)',
            [(email_id, embedding.tobytes()) for email_id, embedding in results])


def retrieve_relevant_emails(query: str, k: int = 5) -> List[str]:
    # sqlite-vss takes the query vector as raw float32 bytes. Note that this
    # LIMIT-based vss_search syntax requires a recent SQLite; older versions
    # need vss_search_params instead.
    query_vector = model.encode([query])
    with sqlite3.connect(DB_PATH) as conn:
        _load_vss(conn)
        results = conn.execute('''
            SELECT email_embeddings.email_id
            FROM vss_email_embeddings
            JOIN email_embeddings ON vss_email_embeddings.rowid = email_embeddings.id
            WHERE vss_search(vss_email_embeddings.embedding, ?)
            LIMIT ?
        ''', (query_vector[0].tobytes(), k)).fetchall()

    return [email_id for (email_id,) in results]


def generate_response(query: str, relevant_email_ids: List[str], api_key: str) -> str:
    # Targets the pre-1.0 openai SDK (openai.ChatCompletion / openai.error).
    import openai
    openai.api_key = api_key

    with sqlite3.connect(DB_PATH) as conn:
        # Assumes an `emails` table with the full message fields, populated
        # elsewhere alongside the embeddings stored by this module.
        relevant_emails = conn.execute('''
            SELECT email_id, subject, sender, recipient, date, body
            FROM emails
            WHERE email_id IN ({})
        '''.format(','.join('?' * len(relevant_email_ids))), relevant_email_ids).fetchall()

    context = "\n\n".join([
        f"Subject: {email[1]}\n"
        f"From: {email[2]}\n"
        f"To: {email[3]}\n"
        f"Date: {email[4]}\n"
        f"Body: {email[5][:500]}..."
        for email in relevant_emails
    ])

    prompt = f"""Based on the following email excerpts, please answer the question: "{query}"

Email Excerpts:
{context}

Please provide a concise and informative answer based solely on the information given in these email excerpts. If the answer cannot be determined from the given information, please state that.

Answer:"""

    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant analyzing emails."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=150
        )
        return response.choices[0].message['content'].strip()
    except openai.error.OpenAIError as e:
        return f"Error generating response: {str(e)}"


# Initialize the database when the module is imported.
init_db()
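
A minimal end-to-end sketch (editor's illustration, not part of the commit); the email dicts follow the shape embed_single_email expects, and it assumes the sqlite-vss extension loads successfully on this system:

# Usage sketch (illustrative only). Guarded for multiprocessing on
# spawn-based platforms, since embed_emails_batch forks worker processes.
if __name__ == '__main__':
    emails = [
        {'parsed_email': {'message_id': '<1@example.com>', 'body': 'Q3 budget approved at $50,000.'}},
        {'parsed_email': {'message_id': '<2@example.com>', 'body': 'Team lunch moved to Friday.'}},
    ]
    embed_emails_batch(emails)
    hits = retrieve_relevant_emails("What happened with the budget?", k=1)
    print(hits)  # expected: ['<1@example.com>']
    # generate_response(query, hits, api_key=...) additionally needs the
    # `emails` table (see the SELECT above) to be populated separately.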

0 commit comments