
Commit 1b0c316

Merge pull request #830 from confident-ai/features/stateless-cleanup
Features/stateless cleanup
2 parents 700a1b8 + aafc017 commit 1b0c316

21 files changed: +851 additions, -715 deletions


a.py

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
+import asyncio
+from deepeval import evaluate
+from deepeval.metrics import AnswerRelevancyMetric
+from deepeval.test_case import LLMTestCase
+
+# Define the test cases
+test_case1 = LLMTestCase(
+    input="The chicken crossed the road, why?",
+    actual_output="Because he felt like it",
+)
+test_case2 = LLMTestCase(
+    input="Knock knock, who's there?", actual_output="The chicken"
+)
+
+# Define the metric
+metric1 = AnswerRelevancyMetric(verbose_mode=False)
+metric2 = AnswerRelevancyMetric(verbose_mode=True)
+
+metric1.measure(test_case1)
+metric2.measure(test_case2)
+
+# # Asynchronous function to measure relevancy concurrently
+# async def measure_relevancy():
+#     await metric1.a_measure(test_case1, _show_indicator=False)
+#     print(metric1.statements)
+#     print("All measurements are done.")
+
+# # Run the asynchronous function and print after completion
+# asyncio.run(measure_relevancy())
+# print("This is printed after all asynchronous operations are complete.")
+
+
+print(metric1.statements)
+print(metric2.statements)

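The commented-out block above hints at asynchronous use. As an illustration only (not part of this commit), the two metrics could also be measured concurrently on top of the same a.py definitions; measure_and_report and main are hypothetical helper names, and the only deepeval calls assumed are a_measure(test_case, _show_indicator=False) and the statements property shown in the answer_relevancy diff further down.

import asyncio

async def measure_and_report(metric, test_case):
    # a_measure returns the score; statements reads the ContextVar-backed
    # property in the same task that populated it
    score = await metric.a_measure(test_case, _show_indicator=False)
    return score, metric.statements

async def main():
    # each coroutine passed to gather runs as its own task with its own
    # copy of the context, so the two metrics cannot clobber each other
    results = await asyncio.gather(
        measure_and_report(metric1, test_case1),
        measure_and_report(metric2, test_case2),
    )
    for score, statements in results:
        print(score, statements)

asyncio.run(main())

Reading metric.statements inside the coroutine matters: a value set in a task's copy of the context is not visible to the caller once asyncio.gather returns.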
deepeval/evaluate.py

Lines changed: 7 additions & 2 deletions
@@ -23,7 +23,12 @@
     ConversationalApiTestCase,
     MetricMetadata,
 )
-from deepeval.utils import get_is_running_deepeval, set_indicator, capture_contextvars, update_contextvars
+from deepeval.utils import (
+    get_is_running_deepeval,
+    set_indicator,
+    capture_contextvars,
+    update_contextvars,
+)
 from deepeval.test_run.cache import (
     test_run_cache_manager,
     Cache,
@@ -300,7 +305,7 @@ async def a_execute_test_cases(
         await measure_metrics_with_indicator(
             metrics, test_case, cached_test_case, ignore_errors
         )
-
+
         for metric in metrics:
             metric_metadata = create_metric_metadata(metric)
             if isinstance(test_case, ConversationalTestCase):

deepeval/metrics/answer_relevancy/answer_relevancy.py

Lines changed: 67 additions & 52 deletions
@@ -2,12 +2,13 @@
 from typing import Optional, List, Union
 from pydantic import BaseModel, Field

-from deepeval.utils import get_or_create_event_loop
+from deepeval.utils import get_or_create_event_loop, generate_uuid
 from deepeval.metrics.utils import (
     validate_conversational_test_case,
     trimAndLoadJson,
     check_llm_test_case_params,
     initialize_model,
+    print_intermediate_steps,
 )
 from deepeval.test_case import (
     LLMTestCase,
@@ -31,6 +32,21 @@ class AnswerRelvancyVerdict(BaseModel):


 class AnswerRelevancyMetric(BaseMetric):
+    @property
+    def statements(self) -> Optional[List[str]]:
+        return self._statements.get()
+
+    @statements.setter
+    def statements(self, value: Optional[List[str]]):
+        self._statements.set(value)
+
+    @property
+    def verdicts(self) -> Optional[List[AnswerRelvancyVerdict]]:
+        return self._verdicts.get()
+
+    @verdicts.setter
+    def verdicts(self, value: Optional[List[AnswerRelvancyVerdict]]):
+        self._verdicts.set(value)

     def __init__(
         self,
@@ -39,35 +55,24 @@ def __init__(
         include_reason: bool = True,
         async_mode: bool = True,
         strict_mode: bool = False,
+        verbose_mode: bool = False,
     ):
-        super().__init__()
-        self._statements: ContextVar[Optional[List[str]]] = ContextVar(f'{self.__class__.__name__}_statements', default=None)
-        self._verdicts: ContextVar[Optional[List[AnswerRelvancyVerdict]]] = ContextVar(f'{self.__class__.__name__}_verdicts', default=None)
+        self._statements: ContextVar[Optional[List[str]]] = ContextVar(
+            generate_uuid(), default=None
+        )
+        self._verdicts: ContextVar[Optional[List[AnswerRelvancyVerdict]]] = (
+            ContextVar(generate_uuid(), default=None)
+        )
         self.threshold = 1 if strict_mode else threshold
         self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
         self.include_reason = include_reason
         self.async_mode = async_mode
         self.strict_mode = strict_mode
-
-    @property
-    def statements(self) -> Optional[List[str]]:
-        return self._statements.get()
-    @statements.setter
-    def statements(self, value: Optional[List[str]]):
-        self._statements.set(value)
-
-    @property
-    def verdicts(self) -> Optional[List[AnswerRelvancyVerdict]]:
-        return self._verdicts.get()
-    @verdicts.setter
-    def verdicts(self, value: Optional[List[AnswerRelvancyVerdict]]):
-        self._verdicts.set(value)
+        self.verbose_mode = verbose_mode

     def measure(
-        self,
-        test_case: Union[LLMTestCase, ConversationalTestCase],
-        verbose: bool = True,
+        self, test_case: Union[LLMTestCase, ConversationalTestCase]
     ) -> float:
         if isinstance(test_case, ConversationalTestCase):
             test_case = validate_conversational_test_case(test_case, self)
@@ -78,44 +83,36 @@ def measure(
             if self.async_mode:
                 loop = get_or_create_event_loop()
                 (
-                    self.statements,
-                    self.verdicts,
-                    self.score,
-                    self.reason,
-                    self.success
-                ) = loop.run_until_complete(
-                    self._measure_async(test_case, verbose)
-                )
+                    self.statements,
+                    self.verdicts,
+                    self.score,
+                    self.reason,
+                    self.success,
+                ) = loop.run_until_complete(self._measure_async(test_case))
             else:
-                self.statements = self._generate_statements(
+                self.statements: List[str] = self._generate_statements(
                     test_case.actual_output
                 )
-                self.verdicts = self._generate_verdicts(test_case.input)
+                self.verdicts: List[AnswerRelvancyVerdict] = (
+                    self._generate_verdicts(test_case.input)
+                )
                 self.score = self._calculate_score()
                 self.reason = self._generate_reason(test_case.input)
                 self.success = self.score >= self.threshold
-                if verbose:
-                    print(f"statements: {self.statements}\nverdicts: {self.verdicts}\n")
+                if self.verbose_mode:
+                    print_intermediate_steps(
+                        self.__name__,
+                        steps=[
+                            f"Statements:\n{self.statements}\n",
+                            f"Verdicts:\n{self.verdicts}",
+                        ],
+                    )
             return self.score
-
-    async def _measure_async(
-        self,
-        test_case: Union[LLMTestCase, ConversationalTestCase],
-        verbose: bool):
-        await self.a_measure(test_case, _show_indicator=False, verbose=verbose)
-        return (
-            self.statements,
-            self.verdicts,
-            self.score,
-            self.reason,
-            self.success
-        )

     async def a_measure(
         self,
         test_case: Union[LLMTestCase, ConversationalTestCase],
         _show_indicator: bool = True,
-        verbose: bool = True
     ) -> float:
         if isinstance(test_case, ConversationalTestCase):
             test_case = validate_conversational_test_case(test_case, self)
@@ -125,19 +122,37 @@ async def a_measure(
         with metric_progress_indicator(
             self, async_mode=True, _show_indicator=_show_indicator
         ):
-            self.statements = await self._a_generate_statements(
+            self.statements: List[str] = await self._a_generate_statements(
                 test_case.actual_output
             )
-            self.verdicts = (
+            self.verdicts: List[AnswerRelvancyVerdict] = (
                 await self._a_generate_verdicts(test_case.input)
             )
             self.score = self._calculate_score()
             self.reason = await self._a_generate_reason(test_case.input)
             self.success = self.score >= self.threshold
-            if verbose:
-                print(f"statements: {self.statements}\nverdicts: {self.verdicts}\nscore: {self.score}, success: {self.success}\n")
+            if self.verbose_mode:
+                print_intermediate_steps(
+                    self.__name__,
+                    steps=[
+                        f"Statements:\n{self.statements}\n",
+                        f"Verdicts:\n{self.verdicts}",
+                    ],
+                )
             return self.score

+    async def _measure_async(
+        self, test_case: Union[LLMTestCase, ConversationalTestCase]
+    ):
+        await self.a_measure(test_case, _show_indicator=False)
+        return (
+            self.statements,
+            self.verdicts,
+            self.score,
+            self.reason,
+            self.success,
+        )
+
     async def _a_generate_reason(self, input: str) -> str:
         if self.include_reason is False:
             return None
@@ -275,4 +290,4 @@ def is_successful(self) -> bool:

     @property
     def __name__(self):
-        return "Answer Relevancy"
+        return "Answer Relevancy"

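The pattern this diff introduces, a per-instance ContextVar named with a fresh UUID and wrapped in an ordinary property, can be distilled into a standalone sketch. The StatelessHolder class below is illustrative only (standard library, not deepeval code). It shows why values written by one asyncio task stay invisible to other tasks and to the caller, which appears to be why _measure_async returns its results as a tuple for measure() to reassign in the caller's context.

import asyncio
import uuid
from contextvars import ContextVar
from typing import List, Optional


class StatelessHolder:
    def __init__(self):
        # a unique name per instance, mirroring ContextVar(generate_uuid(), ...)
        self._statements: ContextVar[Optional[List[str]]] = ContextVar(
            str(uuid.uuid4()), default=None
        )

    @property
    def statements(self) -> Optional[List[str]]:
        return self._statements.get()

    @statements.setter
    def statements(self, value: Optional[List[str]]):
        self._statements.set(value)


holder = StatelessHolder()


async def worker(label: str):
    # this set() lands in the task's own copy of the context
    holder.statements = [f"statement from {label}"]
    await asyncio.sleep(0)
    return holder.statements


async def main():
    print(await asyncio.gather(worker("task-1"), worker("task-2")))
    print(holder.statements)  # still None in the surrounding context


asyncio.run(main())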