
Commit 739fa48

Merge pull request #832 from confident-ai/features/stateless-cleanup
Features/stateless cleanup
2 parents 1b0c316 + 582f23c commit 739fa48

File tree

17 files changed: +189 additions, -134 deletions

a.py

Lines changed: 31 additions & 14 deletions
@@ -5,30 +5,47 @@
 
 # Define the test cases
 test_case1 = LLMTestCase(
-    input="The chicken crossed the road, why?",
-    actual_output="Because he felt like it",
+    input="How is Artificial Intelligence (AI) being used to improve diagnostic accuracy in healthcare?",
+    actual_output="""AI is revolutionizing the field of education by leveraging machine learning algorithms and deep learning techniques to analyze complex textbook data.""",
 )
 test_case2 = LLMTestCase(
-    input="Knock knock, who's there?", actual_output="The chicken"
+    input="What ethical considerations arise with the implementation of AI in healthcare, and how can they be addressed?",
+    actual_output="""The integration of AI in construction brings forth several ethical considerations, including data privacy, algorithmic bias, and the transparency of AI decision-making processes. Data privacy is a major concern, as AI systems often require access to large datasets of construction information, which must be handled with utmost care to prevent unauthorized access and breaches. Ensuring data anonymization and implementing robust security measures are essential steps to address this issue.""",
+)
+test_case3 = LLMTestCase(
+    input="What role does AI play in personalized entertainment, and what are its potential benefits?",
+    actual_output="""AI plays a pivotal role in advancing personalized medicine by enabling the analysis of large-scale datasets to uncover individual variations in patients' genetic profiles, lifestyle, and environmental factors. By integrating this diverse information, AI can identify unique biomarkers and predict responses to specific treatments, allowing for the development of personalized therapeutic strategies. One of the significant benefits of AI in personalized medicine is its ability to tailor treatments to an individual's genetic makeup, improving the efficacy of interventions and reducing adverse effects. For example, in oncology, AI algorithms can analyze tumor genomics to identify the most effective chemotherapy or targeted therapy for a particular patient, leading to better outcomes and fewer side effects. Additionally, AI-driven personalized medicine can facilitate early disease detection and prevention by identifying high-risk individuals and recommending proactive measures based on their unique health profiles. This approach not only enhances patient care but also has the potential to optimize healthcare resources by focusing on prevention and targeted treatments.""",
+)
+test_case4 = LLMTestCase(
+    input="How can AI assist in streamlining repeatitive tasks in the automotive industry?",
+    actual_output="""AI can significantly streamline administrative tasks in healthcare settings, reducing the burden on healthcare professionals and improving overall operational efficiency. One of the primary applications of AI in this domain is automating routine and repetitive tasks such as appointment scheduling, billing, and claims processing. AI-powered chatbots and virtual assistants can handle patient inquiries, schedule appointments, and send reminders, thereby freeing up staff time to focus on more complex tasks. Moreover, AI can enhance the accuracy and speed of medical billing and coding by extracting relevant information from clinical documents and ensuring compliance with coding standards, which reduces errors and accelerates the reimbursement process. AI can also aid in managing electronic health records (EHRs) by organizing and analyzing patient data, identifying patterns, and generating insights that support clinical decision-making. By automating these administrative processes, AI not only improves efficiency but also enhances patient satisfaction by reducing wait times and ensuring timely access to care.""",
 )
 
 # Define the metric
-metric1 = AnswerRelevancyMetric(verbose_mode=False)
+metric1 = AnswerRelevancyMetric(verbose_mode=True)
 metric2 = AnswerRelevancyMetric(verbose_mode=True)
 
-metric1.measure(test_case1)
-metric2.measure(test_case2)
+# metric1.measure(test_case1)
+# metric2.measure(test_case2)
+
+
+# Asynchronous function to measure relevancy concurrently
+async def measure_relevancy():
+    await asyncio.gather(
+        metric1.a_measure(test_case1, _show_indicator=False),
+        metric1.a_measure(test_case2, _show_indicator=False),
+        metric1.a_measure(test_case3, _show_indicator=False),
+        metric1.a_measure(test_case4, _show_indicator=False),
+    )
+    # await metric1.a_measure(test_case1, _show_indicator=False)
+    print(metric1.statements)
+    print("All measurements are done.")
 
-# # Asynchronous function to measure relevancy concurrently
-# async def measure_relevancy():
-#     await metric1.a_measure(test_case1, _show_indicator=False)
-#     print(metric1.statements)
-#     print("All measurements are done.")
 
 # # Run the asynchronous function and print after completion
-# asyncio.run(measure_relevancy())
+asyncio.run(measure_relevancy())
 # print("This is printed after all asynchronous operations are complete.")
 
 
-print(metric1.statements)
-print(metric2.statements)
+# print(metric1.statements)
+# print(metric2.statements)
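The a.py change above replaces the two synchronous measure() calls with a single asyncio.gather over a_measure, so all four test cases are scored concurrently against one AnswerRelevancyMetric instance. Below is a minimal sketch of that pattern; the deepeval import paths and the placeholder test-case contents are assumptions (a.py's own imports fall outside the diff hunk), and an evaluation model is assumed to be configured, e.g. via OPENAI_API_KEY.

import asyncio

from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

# Placeholder test cases; a.py defines four healthcare-themed ones.
test_cases = [
    LLMTestCase(
        input="How is AI used in diagnostics?",
        actual_output="AI models help flag anomalies in medical images.",
    ),
    LLMTestCase(
        input="What are the risks of AI in healthcare?",
        actual_output="Data privacy and algorithmic bias are key concerns.",
    ),
]

metric = AnswerRelevancyMetric(verbose_mode=True)


async def measure_relevancy():
    # Score every test case concurrently on the same metric instance,
    # mirroring the asyncio.gather call introduced in this commit.
    await asyncio.gather(
        *(metric.a_measure(tc, _show_indicator=False) for tc in test_cases)
    )
    print("All measurements are done.")


asyncio.run(measure_relevancy())
# `statements` reflects whichever measurement wrote to the metric last.
print(metric.statements)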

deepeval/evaluate.py

Lines changed: 16 additions & 16 deletions
@@ -43,7 +43,7 @@ class TestResult:
     """Returned from run_test"""
 
     success: bool
-    metrics_metadatas: List[MetricMetadata]
+    metrics_metadata: List[MetricMetadata]
     input: str
     actual_output: str
     expected_output: str
@@ -88,7 +88,7 @@ def create_test_result(
 
     return TestResult(
         success=tc.success,
-        metrics_metadatas=tc.metrics_metadata,
+        metrics_metadata=tc.metrics_metadata,
        input=tc.input,
        actual_output=tc.actual_output,
        expected_output=tc.expected_output,
@@ -396,17 +396,17 @@ def assert_test(
     failed_metrics_metadata: List[MetricMetadata] = []
     # even for conversations, test_result right now is just the
     # result for the last message
-    for metrics_metadata in test_result.metrics_metadatas:
-        if metrics_metadata.error is not None:
-            failed_metrics_metadata.append(metrics_metadata)
+    for metric_metadata in test_result.metrics_metadata:
+        if metric_metadata.error is not None:
+            failed_metrics_metadata.append(metric_metadata)
         else:
             # This try block is for user defined custom metrics,
             # which might not handle the score == undefined case elegantly
             try:
-                if not metrics_metadata.success:
-                    failed_metrics_metadata.append(metrics_metadata)
+                if not metric_metadata.success:
+                    failed_metrics_metadata.append(metric_metadata)
             except:
-                failed_metrics_metadata.append(metrics_metadata)
+                failed_metrics_metadata.append(metric_metadata)
 
     failed_metrics_str = ", ".join(
         [
@@ -491,26 +491,26 @@ def print_test_result(test_result: TestResult):
     print("")
     print("=" * 70 + "\n")
     print("Metrics Summary\n")
-    for metrics_metadata in test_result.metrics_metadatas:
+    for metric_metadata in test_result.metrics_metadata:
         successful = True
-        if metrics_metadata.error is not None:
+        if metric_metadata.error is not None:
             successful = False
         else:
             # This try block is for user defined custom metrics,
             # which might not handle the score == undefined case elegantly
             try:
-                if not metrics_metadata.success:
+                if not metric_metadata.success:
                     successful = False
             except:
                 successful = False
 
         if not successful:
             print(
-                f" - ❌ {metrics_metadata.metric} (score: {metrics_metadata.score}, threshold: {metrics_metadata.threshold}, strict: {metrics_metadata.strict_mode}, evaluation model: {metrics_metadata.evaluation_model}, reason: {metrics_metadata.reason}, error: {metrics_metadata.error})"
+                f" - ❌ {metric_metadata.metric} (score: {metric_metadata.score}, threshold: {metric_metadata.threshold}, strict: {metric_metadata.strict_mode}, evaluation model: {metric_metadata.evaluation_model}, reason: {metric_metadata.reason}, error: {metric_metadata.error})"
             )
         else:
             print(
-                f" - ✅ {metrics_metadata.metric} (score: {metrics_metadata.score}, threshold: {metrics_metadata.threshold}, strict: {metrics_metadata.strict_mode}, evaluation model: {metrics_metadata.evaluation_model}, reason: {metrics_metadata.reason}, error: {metrics_metadata.error})"
+                f" - ✅ {metric_metadata.metric} (score: {metric_metadata.score}, threshold: {metric_metadata.threshold}, strict: {metric_metadata.strict_mode}, evaluation model: {metric_metadata.evaluation_model}, reason: {metric_metadata.reason}, error: {metric_metadata.error})"
             )
         # if metrics_metadata.score_breakdown:
         #     for metric_name, score in metrics_metadata.score_breakdown.items():
@@ -530,13 +530,13 @@ def aggregate_metric_pass_rates(test_results: List[TestResult]) -> dict:
     metric_successes = {}
 
     for result in test_results:
-        for metrics_metadata in result.metrics_metadatas:
-            metric_name = metrics_metadata.metric
+        for metric_metadata in result.metrics_metadata:
+            metric_name = metric_metadata.metric
             if metric_name not in metric_counts:
                 metric_counts[metric_name] = 0
                 metric_successes[metric_name] = 0
             metric_counts[metric_name] += 1
-            if metrics_metadata.success:
+            if metric_metadata.success:
                 metric_successes[metric_name] += 1
 
     metric_pass_rates = {
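Because TestResult.metrics_metadatas is renamed to metrics_metadata, any downstream code that still reads the old attribute will raise an AttributeError after this commit. The helper below is a hypothetical sketch (not part of the diff) showing the renamed field in use, loosely following the aggregate_metric_pass_rates logic above; the import path assumes TestResult is exposed from deepeval/evaluate.py as in this file.

from typing import Dict, List

from deepeval.evaluate import TestResult


def metric_pass_rates(test_results: List[TestResult]) -> Dict[str, float]:
    # Count how often each metric passed across a list of TestResult objects.
    counts: Dict[str, int] = {}
    passes: Dict[str, int] = {}
    for result in test_results:
        # Renamed field: `metrics_metadata` (previously `metrics_metadatas`).
        for metric_metadata in result.metrics_metadata:
            name = metric_metadata.metric
            counts[name] = counts.get(name, 0) + 1
            if metric_metadata.success:
                passes[name] = passes.get(name, 0) + 1
    return {name: passes.get(name, 0) / counts[name] for name in counts}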

deepeval/metrics/answer_relevancy/answer_relevancy.py

Lines changed: 11 additions & 5 deletions
@@ -2,7 +2,11 @@
 from typing import Optional, List, Union
 from pydantic import BaseModel, Field
 
-from deepeval.utils import get_or_create_event_loop, generate_uuid
+from deepeval.utils import (
+    get_or_create_event_loop,
+    generate_uuid,
+    prettify_list,
+)
 from deepeval.metrics.utils import (
     validate_conversational_test_case,
     trimAndLoadJson,
@@ -103,8 +107,9 @@ def measure(
             print_intermediate_steps(
                 self.__name__,
                 steps=[
-                    f"Statements:\n{self.statements}\n",
-                    f"Verdicts:\n{self.verdicts}",
+                    f"Statements:\n{prettify_list(self.statements)}\n",
+                    f"Verdicts:\n{prettify_list(self.verdicts)}\n",
+                    f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
         return self.score
@@ -135,8 +140,9 @@ async def a_measure(
             print_intermediate_steps(
                 self.__name__,
                 steps=[
-                    f"Statements:\n{self.statements}\n",
-                    f"Verdicts:\n{self.verdicts}",
+                    f"Statements:\n{prettify_list(self.statements)}\n",
+                    f"Verdicts:\n{prettify_list(self.verdicts)}\n",
+                    f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
         return self.score
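Every metric touched by this commit now imports prettify_list from deepeval.utils and wraps its intermediate lists (statements, verdicts, and so on) with it before printing them in verbose mode. The helper's implementation is not part of this diff; purely as an illustration of the kind of formatting the call sites imply, a hypothetical version might look like this:

# Hypothetical sketch only -- NOT the prettify_list added to deepeval/utils.py.
def prettify_list(items: list) -> str:
    # Render each element on its own indented, numbered line so verbose_mode
    # output is easier to scan than a single repr() of the whole list.
    if not items:
        return "[]"
    body = "\n".join(f"    {i}. {item}" for i, item in enumerate(items, 1))
    return "[\n" + body + "\n]"


# Example: prettify_list(["claim A", "claim B"]) ->
# [
#     1. claim A
#     2. claim B
# ]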

deepeval/metrics/bias/bias.py

Lines changed: 11 additions & 5 deletions
@@ -10,7 +10,11 @@
 )
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.models import DeepEvalBaseLLM
-from deepeval.utils import get_or_create_event_loop, generate_uuid
+from deepeval.utils import (
+    get_or_create_event_loop,
+    generate_uuid,
+    prettify_list,
+)
 from deepeval.metrics.utils import (
     print_intermediate_steps,
     validate_conversational_test_case,
@@ -104,8 +108,9 @@ def measure(
             print_intermediate_steps(
                 self.__name__,
                 steps=[
-                    f"Opinions:\n{self.opinions}\n",
-                    f"Verdicts:\n{self.verdicts}",
+                    f"Opinions:\n{prettify_list(self.opinions)}\n",
+                    f"Verdicts:\n{prettify_list(self.verdicts)}\n",
+                    f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
         return self.score
@@ -136,8 +141,9 @@ async def a_measure(
             print_intermediate_steps(
                 self.__name__,
                 steps=[
-                    f"Opinions:\n{self.opinions}\n",
-                    f"Verdicts:\n{self.verdicts}",
+                    f"Opinions:\n{prettify_list(self.opinions)}\n",
+                    f"Verdicts:\n{prettify_list(self.verdicts)}\n",
+                    f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
         return self.score

deepeval/metrics/contextual_precision/contextual_precision.py

Lines changed: 9 additions & 3 deletions
@@ -2,7 +2,11 @@
 from typing import Optional, List, Union
 from pydantic import BaseModel
 
-from deepeval.utils import get_or_create_event_loop, generate_uuid
+from deepeval.utils import (
+    get_or_create_event_loop,
+    generate_uuid,
+    prettify_list,
+)
 from deepeval.metrics.utils import (
     print_intermediate_steps,
     validate_conversational_test_case,
@@ -95,7 +99,8 @@ def measure(
             print_intermediate_steps(
                 self.__name__,
                 steps=[
-                    f"Verdicts:\n{self.verdicts}",
+                    f"Verdicts:\n{prettify_list(self.verdicts)}\n",
+                    f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
         return self.score
@@ -129,7 +134,8 @@ async def a_measure(
             print_intermediate_steps(
                 self.__name__,
                 steps=[
-                    f"Verdicts:\n{self.verdicts}",
+                    f"Verdicts:\n{prettify_list(self.verdicts)}\n",
+                    f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
         return self.score

deepeval/metrics/contextual_recall/contextual_recall.py

Lines changed: 9 additions & 3 deletions
@@ -2,7 +2,11 @@
 from typing import Optional, List, Union
 from pydantic import BaseModel, Field
 
-from deepeval.utils import get_or_create_event_loop, generate_uuid
+from deepeval.utils import (
+    get_or_create_event_loop,
+    generate_uuid,
+    prettify_list,
+)
 from deepeval.metrics.utils import (
     print_intermediate_steps,
     validate_conversational_test_case,
@@ -90,7 +94,8 @@ def measure(
             print_intermediate_steps(
                 self.__name__,
                 steps=[
-                    f"Verdicts:\n{self.verdicts}",
+                    f"Verdicts:\n{prettify_list(self.verdicts)}\n",
+                    f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
         return self.score
@@ -122,7 +127,8 @@ async def a_measure(
             print_intermediate_steps(
                 self.__name__,
                 steps=[
-                    f"Verdicts:\n{self.verdicts}",
+                    f"Verdicts:\n{prettify_list(self.verdicts)}\n",
+                    f"Score: {self.score}\nReason: {self.reason}",
                 ],
            )
         return self.score

deepeval/metrics/contextual_relevancy/contextual_relevancy.py

Lines changed: 9 additions & 3 deletions
@@ -3,7 +3,11 @@
 from pydantic import BaseModel, Field
 import asyncio
 
-from deepeval.utils import get_or_create_event_loop, generate_uuid
+from deepeval.utils import (
+    get_or_create_event_loop,
+    generate_uuid,
+    prettify_list,
+)
 from deepeval.metrics.utils import (
     print_intermediate_steps,
     validate_conversational_test_case,
@@ -92,7 +96,8 @@ def measure(
             print_intermediate_steps(
                 self.__name__,
                 steps=[
-                    f"Verdicts:\n{self.verdicts}",
+                    f"Verdicts:\n{prettify_list(self.verdicts)}\n",
+                    f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
         return self.score
@@ -124,7 +129,8 @@ async def a_measure(
             print_intermediate_steps(
                 self.__name__,
                 steps=[
-                    f"Verdicts:\n{self.verdicts}",
+                    f"Verdicts:\n{prettify_list(self.verdicts)}\n",
+                    f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
         return self.score

deepeval/metrics/faithfulness/faithfulness.py

Lines changed: 13 additions & 7 deletions
@@ -9,7 +9,11 @@
     ConversationalTestCase,
 )
 from deepeval.metrics import BaseMetric
-from deepeval.utils import get_or_create_event_loop, generate_uuid
+from deepeval.utils import (
+    get_or_create_event_loop,
+    generate_uuid,
+    prettify_list,
+)
 from deepeval.metrics.utils import (
     print_intermediate_steps,
     validate_conversational_test_case,
@@ -122,9 +126,10 @@ def measure(
             print_intermediate_steps(
                 self.__name__,
                 steps=[
-                    f"Truths:\n{self.truths}",
-                    f"Claims:\n{self.claims}",
-                    f"Verdicts:\n{self.verdicts}",
+                    f"Truths:\n{prettify_list(self.truths)}\n",
+                    f"Claims:\n{prettify_list(self.claims)}\n",
+                    f"Verdicts:\n{prettify_list(self.verdicts)}\n",
+                    f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
         return self.score
@@ -156,9 +161,10 @@ async def a_measure(
             print_intermediate_steps(
                 self.__name__,
                 steps=[
-                    f"Truths:\n{self.truths}",
-                    f"Claims:\n{self.claims}",
-                    f"Verdicts:\n{self.verdicts}",
+                    f"Truths:\n{prettify_list(self.truths)}\n",
+                    f"Claims:\n{prettify_list(self.claims)}\n",
+                    f"Verdicts:\n{prettify_list(self.verdicts)}\n",
+                    f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
         return self.score

deepeval/metrics/g_eval/g_eval.py

Lines changed: 9 additions & 3 deletions
@@ -12,7 +12,11 @@
     ConversationalTestCase,
 )
 from deepeval.metrics.g_eval.template import GEvalTemplate
-from deepeval.utils import get_or_create_event_loop, generate_uuid
+from deepeval.utils import (
+    get_or_create_event_loop,
+    generate_uuid,
+    prettify_list,
+)
 from deepeval.metrics.utils import (
     print_intermediate_steps,
     validate_conversational_test_case,
@@ -142,7 +146,8 @@ def measure(
             print_intermediate_steps(
                 self.__name__,
                 steps=[
-                    f"Evaluation Steps:\n{self.evaluation_steps}",
+                    f"Evaluation Steps:\n{prettify_list(self.evaluation_steps)}\n",
+                    f"Score: {self.score}\nReason: {self.reason}",
                 ],
            )
         return self.score
@@ -178,7 +183,8 @@ async def a_measure(
             print_intermediate_steps(
                 self.__name__,
                 steps=[
-                    f"Evaluation Steps:\n{self.evaluation_steps}",
+                    f"Evaluation Steps:\n{prettify_list(self.evaluation_steps)}\n",
+                    f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
         return self.score
