Features/stateless cleanup #832

Merged: 6 commits on Jun 17, 2024

45 changes: 31 additions & 14 deletions a.py
@@ -5,30 +5,47 @@

# Define the test cases
test_case1 = LLMTestCase(
input="The chicken crossed the road, why?",
actual_output="Because he felt like it",
input="How is Artificial Intelligence (AI) being used to improve diagnostic accuracy in healthcare?",
actual_output="""AI is revolutionizing the field of education by leveraging machine learning algorithms and deep learning techniques to analyze complex textbook data.""",
)
test_case2 = LLMTestCase(
input="Knock knock, who's there?", actual_output="The chicken"
input="What ethical considerations arise with the implementation of AI in healthcare, and how can they be addressed?",
actual_output="""The integration of AI in construction brings forth several ethical considerations, including data privacy, algorithmic bias, and the transparency of AI decision-making processes. Data privacy is a major concern, as AI systems often require access to large datasets of construction information, which must be handled with utmost care to prevent unauthorized access and breaches. Ensuring data anonymization and implementing robust security measures are essential steps to address this issue.""",
)
test_case3 = LLMTestCase(
input="What role does AI play in personalized entertainment, and what are its potential benefits?",
actual_output="""AI plays a pivotal role in advancing personalized medicine by enabling the analysis of large-scale datasets to uncover individual variations in patients' genetic profiles, lifestyle, and environmental factors. By integrating this diverse information, AI can identify unique biomarkers and predict responses to specific treatments, allowing for the development of personalized therapeutic strategies. One of the significant benefits of AI in personalized medicine is its ability to tailor treatments to an individual's genetic makeup, improving the efficacy of interventions and reducing adverse effects. For example, in oncology, AI algorithms can analyze tumor genomics to identify the most effective chemotherapy or targeted therapy for a particular patient, leading to better outcomes and fewer side effects. Additionally, AI-driven personalized medicine can facilitate early disease detection and prevention by identifying high-risk individuals and recommending proactive measures based on their unique health profiles. This approach not only enhances patient care but also has the potential to optimize healthcare resources by focusing on prevention and targeted treatments.""",
)
test_case4 = LLMTestCase(
input="How can AI assist in streamlining repeatitive tasks in the automotive industry?",
actual_output="""AI can significantly streamline administrative tasks in healthcare settings, reducing the burden on healthcare professionals and improving overall operational efficiency. One of the primary applications of AI in this domain is automating routine and repetitive tasks such as appointment scheduling, billing, and claims processing. AI-powered chatbots and virtual assistants can handle patient inquiries, schedule appointments, and send reminders, thereby freeing up staff time to focus on more complex tasks. Moreover, AI can enhance the accuracy and speed of medical billing and coding by extracting relevant information from clinical documents and ensuring compliance with coding standards, which reduces errors and accelerates the reimbursement process. AI can also aid in managing electronic health records (EHRs) by organizing and analyzing patient data, identifying patterns, and generating insights that support clinical decision-making. By automating these administrative processes, AI not only improves efficiency but also enhances patient satisfaction by reducing wait times and ensuring timely access to care.""",
)

# Define the metric
metric1 = AnswerRelevancyMetric(verbose_mode=False)
metric1 = AnswerRelevancyMetric(verbose_mode=True)
metric2 = AnswerRelevancyMetric(verbose_mode=True)

metric1.measure(test_case1)
metric2.measure(test_case2)
# metric1.measure(test_case1)
# metric2.measure(test_case2)


# Asynchronous function to measure relevancy concurrently
async def measure_relevancy():
    await asyncio.gather(
        metric1.a_measure(test_case1, _show_indicator=False),
        metric1.a_measure(test_case2, _show_indicator=False),
        metric1.a_measure(test_case3, _show_indicator=False),
        metric1.a_measure(test_case4, _show_indicator=False),
    )
    # await metric1.a_measure(test_case1, _show_indicator=False)
    print(metric1.statements)
    print("All measurements are done.")

# # Asynchronous function to measure relevancy concurrently
# async def measure_relevancy():
# await metric1.a_measure(test_case1, _show_indicator=False)
# print(metric1.statements)
# print("All measurements are done.")

# # Run the asynchronous function and print after completion
# asyncio.run(measure_relevancy())
asyncio.run(measure_relevancy())
# print("This is printed after all asynchronous operations are complete.")


print(metric1.statements)
print(metric2.statements)
# print(metric1.statements)
# print(metric2.statements)
32 changes: 16 additions & 16 deletions deepeval/evaluate.py
@@ -43,7 +43,7 @@ class TestResult:
"""Returned from run_test"""

success: bool
metrics_metadatas: List[MetricMetadata]
metrics_metadata: List[MetricMetadata]
input: str
actual_output: str
expected_output: str
@@ -88,7 +88,7 @@ def create_test_result(

return TestResult(
success=tc.success,
metrics_metadatas=tc.metrics_metadata,
metrics_metadata=tc.metrics_metadata,
input=tc.input,
actual_output=tc.actual_output,
expected_output=tc.expected_output,
@@ -396,17 +396,17 @@ def assert_test(
failed_metrics_metadata: List[MetricMetadata] = []
# even for conversations, test_result right now is just the
# result for the last message
for metrics_metadata in test_result.metrics_metadatas:
if metrics_metadata.error is not None:
failed_metrics_metadata.append(metrics_metadata)
for metric_metadata in test_result.metrics_metadata:
if metric_metadata.error is not None:
failed_metrics_metadata.append(metric_metadata)
else:
# This try block is for user defined custom metrics,
# which might not handle the score == undefined case elegantly
try:
if not metrics_metadata.success:
failed_metrics_metadata.append(metrics_metadata)
if not metric_metadata.success:
failed_metrics_metadata.append(metric_metadata)
except:
failed_metrics_metadata.append(metrics_metadata)
failed_metrics_metadata.append(metric_metadata)

failed_metrics_str = ", ".join(
[
@@ -491,26 +491,26 @@ def print_test_result(test_result: TestResult):
print("")
print("=" * 70 + "\n")
print("Metrics Summary\n")
for metrics_metadata in test_result.metrics_metadatas:
for metric_metadata in test_result.metrics_metadata:
successful = True
if metrics_metadata.error is not None:
if metric_metadata.error is not None:
successful = False
else:
# This try block is for user defined custom metrics,
# which might not handle the score == undefined case elegantly
try:
if not metrics_metadata.success:
if not metric_metadata.success:
successful = False
except:
successful = False

if not successful:
print(
f" - ❌ {metrics_metadata.metric} (score: {metrics_metadata.score}, threshold: {metrics_metadata.threshold}, strict: {metrics_metadata.strict_mode}, evaluation model: {metrics_metadata.evaluation_model}, reason: {metrics_metadata.reason}, error: {metrics_metadata.error})"
f" - ❌ {metric_metadata.metric} (score: {metric_metadata.score}, threshold: {metric_metadata.threshold}, strict: {metric_metadata.strict_mode}, evaluation model: {metric_metadata.evaluation_model}, reason: {metric_metadata.reason}, error: {metric_metadata.error})"
)
else:
print(
f" - ✅ {metrics_metadata.metric} (score: {metrics_metadata.score}, threshold: {metrics_metadata.threshold}, strict: {metrics_metadata.strict_mode}, evaluation model: {metrics_metadata.evaluation_model}, reason: {metrics_metadata.reason}, error: {metrics_metadata.error})"
f" - ✅ {metric_metadata.metric} (score: {metric_metadata.score}, threshold: {metric_metadata.threshold}, strict: {metric_metadata.strict_mode}, evaluation model: {metric_metadata.evaluation_model}, reason: {metric_metadata.reason}, error: {metric_metadata.error})"
)
# if metrics_metadata.score_breakdown:
# for metric_name, score in metrics_metadata.score_breakdown.items():
@@ -530,13 +530,13 @@ def aggregate_metric_pass_rates(test_results: List[TestResult]) -> dict:
metric_successes = {}

for result in test_results:
for metrics_metadata in result.metrics_metadatas:
metric_name = metrics_metadata.metric
for metric_metadata in result.metrics_metadata:
metric_name = metric_metadata.metric
if metric_name not in metric_counts:
metric_counts[metric_name] = 0
metric_successes[metric_name] = 0
metric_counts[metric_name] += 1
if metrics_metadata.success:
if metric_metadata.success:
metric_successes[metric_name] += 1

metric_pass_rates = {
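
Any downstream code that read the old plural attribute needs the same one-line rename. A minimal sketch of a hypothetical consumer (the TestResult field and the MetricMetadata attributes are taken from the diff above; the helper itself is illustrative):

from typing import List

from deepeval.evaluate import TestResult  # the class patched above


def failed_metric_names(results: List[TestResult]) -> List[str]:
    # Collect the names of metrics that errored or did not pass,
    # loosely mirroring the failure check in assert_test above.
    names = []
    for result in results:
        # Previously: result.metrics_metadatas
        for metric_metadata in result.metrics_metadata:
            if metric_metadata.error is not None or not metric_metadata.success:
                names.append(metric_metadata.metric)
    return names
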
16 changes: 11 additions & 5 deletions deepeval/metrics/answer_relevancy/answer_relevancy.py
@@ -2,7 +2,11 @@
from typing import Optional, List, Union
from pydantic import BaseModel, Field

from deepeval.utils import get_or_create_event_loop, generate_uuid
from deepeval.utils import (
get_or_create_event_loop,
generate_uuid,
prettify_list,
)
from deepeval.metrics.utils import (
validate_conversational_test_case,
trimAndLoadJson,
@@ -103,8 +107,9 @@ def measure(
print_intermediate_steps(
self.__name__,
steps=[
f"Statements:\n{self.statements}\n",
f"Verdicts:\n{self.verdicts}",
f"Statements:\n{prettify_list(self.statements)}\n",
f"Verdicts:\n{prettify_list(self.verdicts)}\n",
f"Score: {self.score}\nReason: {self.reason}",
],
)
return self.score
@@ -135,8 +140,9 @@ async def a_measure(
print_intermediate_steps(
self.__name__,
steps=[
f"Statements:\n{self.statements}\n",
f"Verdicts:\n{self.verdicts}",
f"Statements:\n{prettify_list(self.statements)}\n",
f"Verdicts:\n{prettify_list(self.verdicts)}\n",
f"Score: {self.score}\nReason: {self.reason}",
],
)
return self.score
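
prettify_list itself is not part of this diff; it is imported from deepeval/utils.py and is only used here to keep the verbose-mode output readable. A rough stand-in, purely to illustrate the intent (the real helper may format items differently):

def prettify_list(items: list) -> str:
    # Render each element on its own line, indented, so long lists of
    # statements or verdicts do not collapse into a single wall of text.
    if not items:
        return "[]"
    body = ",\n".join(f"    {item!r}" for item in items)
    return f"[\n{body}\n]"
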
16 changes: 11 additions & 5 deletions deepeval/metrics/bias/bias.py
@@ -10,7 +10,11 @@
)
from deepeval.metrics.indicator import metric_progress_indicator
from deepeval.models import DeepEvalBaseLLM
from deepeval.utils import get_or_create_event_loop, generate_uuid
from deepeval.utils import (
get_or_create_event_loop,
generate_uuid,
prettify_list,
)
from deepeval.metrics.utils import (
print_intermediate_steps,
validate_conversational_test_case,
@@ -104,8 +108,9 @@ def measure(
print_intermediate_steps(
self.__name__,
steps=[
f"Opinions:\n{self.opinions}\n",
f"Verdicts:\n{self.verdicts}",
f"Opinions:\n{prettify_list(self.opinions)}\n",
f"Verdicts:\n{prettify_list(self.verdicts)}\n",
f"Score: {self.score}\nReason: {self.reason}",
],
)
return self.score
@@ -136,8 +141,9 @@ async def a_measure(
print_intermediate_steps(
self.__name__,
steps=[
f"Opinions:\n{self.opinions}\n",
f"Verdicts:\n{self.verdicts}",
f"Opinions:\n{prettify_list(self.opinions)}\n",
f"Verdicts:\n{prettify_list(self.verdicts)}\n",
f"Score: {self.score}\nReason: {self.reason}",
],
)
return self.score
12 changes: 9 additions & 3 deletions deepeval/metrics/contextual_precision/contextual_precision.py
@@ -2,7 +2,11 @@
from typing import Optional, List, Union
from pydantic import BaseModel

from deepeval.utils import get_or_create_event_loop, generate_uuid
from deepeval.utils import (
get_or_create_event_loop,
generate_uuid,
prettify_list,
)
from deepeval.metrics.utils import (
print_intermediate_steps,
validate_conversational_test_case,
@@ -95,7 +99,8 @@ def measure(
print_intermediate_steps(
self.__name__,
steps=[
f"Verdicts:\n{self.verdicts}",
f"Verdicts:\n{prettify_list(self.verdicts)}\n",
f"Score: {self.score}\nReason: {self.reason}",
],
)
return self.score
@@ -129,7 +134,8 @@ async def a_measure(
print_intermediate_steps(
self.__name__,
steps=[
f"Verdicts:\n{self.verdicts}",
f"Verdicts:\n{prettify_list(self.verdicts)}\n",
f"Score: {self.score}\nReason: {self.reason}",
],
)
return self.score
12 changes: 9 additions & 3 deletions deepeval/metrics/contextual_recall/contextual_recall.py
@@ -2,7 +2,11 @@
from typing import Optional, List, Union
from pydantic import BaseModel, Field

from deepeval.utils import get_or_create_event_loop, generate_uuid
from deepeval.utils import (
get_or_create_event_loop,
generate_uuid,
prettify_list,
)
from deepeval.metrics.utils import (
print_intermediate_steps,
validate_conversational_test_case,
@@ -90,7 +94,8 @@ def measure(
print_intermediate_steps(
self.__name__,
steps=[
f"Verdicts:\n{self.verdicts}",
f"Verdicts:\n{prettify_list(self.verdicts)}\n",
f"Score: {self.score}\nReason: {self.reason}",
],
)
return self.score
@@ -122,7 +127,8 @@ async def a_measure(
print_intermediate_steps(
self.__name__,
steps=[
f"Verdicts:\n{self.verdicts}",
f"Verdicts:\n{prettify_list(self.verdicts)}\n",
f"Score: {self.score}\nReason: {self.reason}",
],
)
return self.score
12 changes: 9 additions & 3 deletions deepeval/metrics/contextual_relevancy/contextual_relevancy.py
@@ -3,7 +3,11 @@
from pydantic import BaseModel, Field
import asyncio

from deepeval.utils import get_or_create_event_loop, generate_uuid
from deepeval.utils import (
get_or_create_event_loop,
generate_uuid,
prettify_list,
)
from deepeval.metrics.utils import (
print_intermediate_steps,
validate_conversational_test_case,
@@ -92,7 +96,8 @@ def measure(
print_intermediate_steps(
self.__name__,
steps=[
f"Verdicts:\n{self.verdicts}",
f"Verdicts:\n{prettify_list(self.verdicts)}\n",
f"Score: {self.score}\nReason: {self.reason}",
],
)
return self.score
@@ -124,7 +129,8 @@ async def a_measure(
print_intermediate_steps(
self.__name__,
steps=[
f"Verdicts:\n{self.verdicts}",
f"Verdicts:\n{prettify_list(self.verdicts)}\n",
f"Score: {self.score}\nReason: {self.reason}",
],
)
return self.score
20 changes: 13 additions & 7 deletions deepeval/metrics/faithfulness/faithfulness.py
@@ -9,7 +9,11 @@
ConversationalTestCase,
)
from deepeval.metrics import BaseMetric
from deepeval.utils import get_or_create_event_loop, generate_uuid
from deepeval.utils import (
get_or_create_event_loop,
generate_uuid,
prettify_list,
)
from deepeval.metrics.utils import (
print_intermediate_steps,
validate_conversational_test_case,
@@ -122,9 +126,10 @@ def measure(
print_intermediate_steps(
self.__name__,
steps=[
f"Truths:\n{self.truths}",
f"Claims:\n{self.claims}",
f"Verdicts:\n{self.verdicts}",
f"Truths:\n{prettify_list(self.truths)}\n",
f"Claims:\n{prettify_list(self.claims)}\n",
f"Verdicts:\n{prettify_list(self.verdicts)}\n",
f"Score: {self.score}\nReason: {self.reason}",
],
)
return self.score
@@ -156,9 +161,10 @@ async def a_measure(
print_intermediate_steps(
self.__name__,
steps=[
f"Truths:\n{self.truths}",
f"Claims:\n{self.claims}",
f"Verdicts:\n{self.verdicts}",
f"Truths:\n{prettify_list(self.truths)}\n",
f"Claims:\n{prettify_list(self.claims)}\n",
f"Verdicts:\n{prettify_list(self.verdicts)}\n",
f"Score: {self.score}\nReason: {self.reason}",
],
)
return self.score
12 changes: 9 additions & 3 deletions deepeval/metrics/g_eval/g_eval.py
@@ -12,7 +12,11 @@
ConversationalTestCase,
)
from deepeval.metrics.g_eval.template import GEvalTemplate
from deepeval.utils import get_or_create_event_loop, generate_uuid
from deepeval.utils import (
get_or_create_event_loop,
generate_uuid,
prettify_list,
)
from deepeval.metrics.utils import (
print_intermediate_steps,
validate_conversational_test_case,
@@ -142,7 +146,8 @@ def measure(
print_intermediate_steps(
self.__name__,
steps=[
f"Evaluation Steps:\n{self.evaluation_steps}",
f"Evaluation Steps:\n{prettify_list(self.evaluation_steps)}\n",
f"Score: {self.score}\nReason: {self.reason}",
],
)
return self.score
@@ -178,7 +183,8 @@ async def a_measure(
print_intermediate_steps(
self.__name__,
steps=[
f"Evaluation Steps:\n{self.evaluation_steps}",
f"Evaluation Steps:\n{prettify_list(self.evaluation_steps)}\n",
f"Score: {self.score}\nReason: {self.reason}",
],
)
return self.score
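
Each metric module touched here receives the same two-part change: intermediate lists are passed through prettify_list, and a final Score/Reason step is appended. Enabling the output is just the verbose_mode flag already exercised in a.py; a short usage sketch (assuming FaithfulnessMetric exposes the same flag, and supplying the retrieval_context that metric requires):

from deepeval.metrics import FaithfulnessMetric
from deepeval.test_case import LLMTestCase

# verbose_mode=True triggers print_intermediate_steps(), which now prints
# prettified truths, claims, and verdicts followed by the score and reason.
metric = FaithfulnessMetric(verbose_mode=True)
metric.measure(
    LLMTestCase(
        input="Who wrote the report?",
        actual_output="The report was written by the audit team.",
        retrieval_context=["The annual report was prepared by the audit team."],
    )
)
print(metric.score, metric.reason)
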