
Commit f862800

1 parent 6816454 commit f862800

2 files changed: +17 -63 lines changed

deepeval/evaluate.py

Lines changed: 16 additions & 16 deletions
@@ -43,7 +43,7 @@ class TestResult:
     """Returned from run_test"""
 
     success: bool
-    metrics_metadatas: List[MetricMetadata]
+    metrics_metadata: List[MetricMetadata]
     input: str
     actual_output: str
     expected_output: str
@@ -88,7 +88,7 @@ def create_test_result(
 
     return TestResult(
         success=tc.success,
-        metrics_metadatas=tc.metrics_metadata,
+        metrics_metadata=tc.metrics_metadata,
         input=tc.input,
         actual_output=tc.actual_output,
         expected_output=tc.expected_output,
@@ -396,17 +396,17 @@ def assert_test(
         failed_metrics_metadata: List[MetricMetadata] = []
         # even for conversations, test_result right now is just the
         # result for the last message
-        for metrics_metadata in test_result.metrics_metadatas:
-            if metrics_metadata.error is not None:
-                failed_metrics_metadata.append(metrics_metadata)
+        for metric_metadata in test_result.metrics_metadata:
+            if metric_metadata.error is not None:
+                failed_metrics_metadata.append(metric_metadata)
             else:
                 # This try block is for user defined custom metrics,
                 # which might not handle the score == undefined case elegantly
                 try:
-                    if not metrics_metadata.success:
-                        failed_metrics_metadata.append(metrics_metadata)
+                    if not metric_metadata.success:
+                        failed_metrics_metadata.append(metric_metadata)
                 except:
-                    failed_metrics_metadata.append(metrics_metadata)
+                    failed_metrics_metadata.append(metric_metadata)
 
         failed_metrics_str = ", ".join(
             [
@@ -491,26 +491,26 @@ def print_test_result(test_result: TestResult):
     print("")
    print("=" * 70 + "\n")
     print("Metrics Summary\n")
-    for metrics_metadata in test_result.metrics_metadatas:
+    for metric_metadata in test_result.metrics_metadata:
         successful = True
-        if metrics_metadata.error is not None:
+        if metric_metadata.error is not None:
             successful = False
         else:
             # This try block is for user defined custom metrics,
             # which might not handle the score == undefined case elegantly
             try:
-                if not metrics_metadata.success:
+                if not metric_metadata.success:
                     successful = False
             except:
                 successful = False
 
         if not successful:
             print(
-                f" - ❌ {metrics_metadata.metric} (score: {metrics_metadata.score}, threshold: {metrics_metadata.threshold}, strict: {metrics_metadata.strict_mode}, evaluation model: {metrics_metadata.evaluation_model}, reason: {metrics_metadata.reason}, error: {metrics_metadata.error})"
+                f" - ❌ {metric_metadata.metric} (score: {metric_metadata.score}, threshold: {metric_metadata.threshold}, strict: {metric_metadata.strict_mode}, evaluation model: {metric_metadata.evaluation_model}, reason: {metric_metadata.reason}, error: {metric_metadata.error})"
             )
         else:
             print(
-                f" - ✅ {metrics_metadata.metric} (score: {metrics_metadata.score}, threshold: {metrics_metadata.threshold}, strict: {metrics_metadata.strict_mode}, evaluation model: {metrics_metadata.evaluation_model}, reason: {metrics_metadata.reason}, error: {metrics_metadata.error})"
+                f" - ✅ {metric_metadata.metric} (score: {metric_metadata.score}, threshold: {metric_metadata.threshold}, strict: {metric_metadata.strict_mode}, evaluation model: {metric_metadata.evaluation_model}, reason: {metric_metadata.reason}, error: {metric_metadata.error})"
             )
         # if metrics_metadata.score_breakdown:
         #     for metric_name, score in metrics_metadata.score_breakdown.items():
@@ -530,13 +530,13 @@ def aggregate_metric_pass_rates(test_results: List[TestResult]) -> dict:
     metric_successes = {}
 
     for result in test_results:
-        for metrics_metadata in result.metrics_metadatas:
-            metric_name = metrics_metadata.metric
+        for metric_metadata in result.metrics_metadata:
+            metric_name = metric_metadata.metric
             if metric_name not in metric_counts:
                 metric_counts[metric_name] = 0
                 metric_successes[metric_name] = 0
             metric_counts[metric_name] += 1
-            if metrics_metadata.success:
+            if metric_metadata.success:
                 metric_successes[metric_name] += 1
 
     metric_pass_rates = {
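
For consumers of TestResult, the visible effect of this commit is that per-metric results are now read from the metrics_metadata attribute rather than metrics_metadatas. A minimal sketch of reading the renamed field, assuming the top-level evaluate(test_cases, metrics) helper returns the list of TestResult objects (the test case contents and metric choice below are illustrative, not part of this commit):

    from deepeval import evaluate
    from deepeval.metrics import AnswerRelevancyMetric
    from deepeval.test_case import LLMTestCase

    # Illustrative test case; any LLMTestCase works here.
    test_case = LLMTestCase(
        input="What is the primary difference between a comet and an asteroid?",
        actual_output="Comets are icy bodies that develop tails near the Sun; asteroids are rocky or metallic.",
    )

    # Assumption: evaluate() returns List[TestResult].
    test_results = evaluate([test_case], [AnswerRelevancyMetric(threshold=0.7)])

    for test_result in test_results:
        # After this commit the field is `metrics_metadata` (was `metrics_metadatas`).
        for metric_metadata in test_result.metrics_metadata:
            print(metric_metadata.metric, metric_metadata.score, metric_metadata.success)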

tests/test_dataset.py

Lines changed: 1 addition & 47 deletions
@@ -8,6 +8,7 @@
 from deepeval.test_case import LLMTestCase
 from deepeval.evaluate import aggregate_metric_pass_rates, TestResult
 from deepeval.metrics import AnswerRelevancyMetric, BiasMetric
+from deepeval.test_run.api import MetricMetadata
 
 
 class FakeMetric1(BaseMetric):
@@ -80,50 +81,3 @@ def test_create_dataset():
         actual_output_key_name="actual_output",
     )
     assert len(dataset.test_cases) == 10, "Test Cases not loaded from JSON"
-
-
-def test_aggregate_metric_pass_rates():
-    test_case = LLMTestCase(
-        input="What is the primary difference between a comet and an asteroid?",
-        actual_output="Comets and asteroids are both celestial bodies found in our solar system but differ in composition and behavior. Comets, made up of ice, dust, and small rocky particles, develop glowing comas and tails when near the Sun. In contrast, asteroids are primarily rocky or metallic and are mostly found in the asteroid belt between Mars and Jupiter. Rabbits are found in the wild.",
-        expected_output="Comets and asteroids are both celestial bodies found in our solar system but differ in composition and behavior. Comets, made up of ice, dust, and small rocky particles, develop glowing comas and tails when near the Sun. In contrast, asteroids are primarily rocky or metallic and are mostly found in the asteroid belt between Mars and Jupiter. Rabbits are found in the wild.",
-    )
-    fake_metric_1_fail = FakeMetric1(_success=False)
-    fake_metric_2_fail = FakeMetric2()
-    fake_metric_1_fail.measure(test_case=test_case)
-    fake_metric_2_fail.measure(test_case=test_case)
-
-    fake_metric_1_pass = FakeMetric1()
-    fake_metric_2_pass = FakeMetric2()
-    fake_metric_1_pass.measure(test_case=test_case)
-    fake_metric_2_pass.measure(test_case=test_case)
-    test_results = [
-        TestResult(
-            success=False,
-            metrics=[
-                fake_metric_1_fail,
-                fake_metric_2_fail,
-            ],
-            input="some input",
-            actual_output="some output",
-            expected_output="expected output",
-            context=["context"],
-            retrieval_context=["retrieval context"],
-        ),
-        TestResult(
-            success=True,
-            metrics=[
-                fake_metric_1_pass,
-                fake_metric_2_pass,
-            ],
-            input="another input",
-            actual_output="another output",
-            expected_output="another expected output",
-            context=["another context"],
-            retrieval_context=["another retrieval context"],
-        ),
-    ]
-
-    expected_result = {"FakeMetric1": 0.5, "FakeMetric2": 1.0}
-    result = aggregate_metric_pass_rates(test_results)
-    assert result == expected_result
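
The removed test constructed TestResult with a metrics= keyword, which no longer matches the dataclass above (the field is metrics_metadata: List[MetricMetadata]), and the commit adds a MetricMetadata import to this file. A plausible sketch of rebuilding the test against the renamed field follows; the exact MetricMetadata constructor arguments are an assumption based on the attributes used in evaluate.py (metric, score, threshold, success), not something this commit defines:

    from deepeval.evaluate import TestResult, aggregate_metric_pass_rates
    from deepeval.test_run.api import MetricMetadata


    def test_aggregate_metric_pass_rates():
        # Assumed MetricMetadata fields; only metric/score/threshold/success are exercised here.
        failing_1 = MetricMetadata(metric="FakeMetric1", score=0.0, threshold=0.5, success=False)
        passing_1 = MetricMetadata(metric="FakeMetric1", score=1.0, threshold=0.5, success=True)
        passing_2 = MetricMetadata(metric="FakeMetric2", score=1.0, threshold=0.5, success=True)

        test_results = [
            TestResult(
                success=False,
                metrics_metadata=[failing_1, passing_2],
                input="some input",
                actual_output="some output",
                expected_output="expected output",
                context=["context"],
                retrieval_context=["retrieval context"],
            ),
            TestResult(
                success=True,
                metrics_metadata=[passing_1, passing_2],
                input="another input",
                actual_output="another output",
                expected_output="another expected output",
                context=["another context"],
                retrieval_context=["another retrieval context"],
            ),
        ]

        # FakeMetric1 passes in one of two results, FakeMetric2 passes in both.
        assert aggregate_metric_pass_rates(test_results) == {
            "FakeMetric1": 0.5,
            "FakeMetric2": 1.0,
        }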
