
Commit f862800

1 parent 6816454 commit f862800

2 files changed: +17 -63 lines changed

deepeval/evaluate.py

Lines changed: 16 additions & 16 deletions
@@ -43,7 +43,7 @@ class TestResult:
     """Returned from run_test"""
 
     success: bool
-    metrics_metadatas: List[MetricMetadata]
+    metrics_metadata: List[MetricMetadata]
     input: str
     actual_output: str
     expected_output: str
@@ -88,7 +88,7 @@ def create_test_result(
 
     return TestResult(
         success=tc.success,
-        metrics_metadatas=tc.metrics_metadata,
+        metrics_metadata=tc.metrics_metadata,
         input=tc.input,
         actual_output=tc.actual_output,
         expected_output=tc.expected_output,
@@ -396,17 +396,17 @@ def assert_test(
         failed_metrics_metadata: List[MetricMetadata] = []
         # even for conversations, test_result right now is just the
         # result for the last message
-        for metrics_metadata in test_result.metrics_metadatas:
-            if metrics_metadata.error is not None:
-                failed_metrics_metadata.append(metrics_metadata)
+        for metric_metadata in test_result.metrics_metadata:
+            if metric_metadata.error is not None:
+                failed_metrics_metadata.append(metric_metadata)
             else:
                 # This try block is for user defined custom metrics,
                 # which might not handle the score == undefined case elegantly
                 try:
-                    if not metrics_metadata.success:
-                        failed_metrics_metadata.append(metrics_metadata)
+                    if not metric_metadata.success:
+                        failed_metrics_metadata.append(metric_metadata)
                 except:
-                    failed_metrics_metadata.append(metrics_metadata)
+                    failed_metrics_metadata.append(metric_metadata)
 
         failed_metrics_str = ", ".join(
             [
@@ -491,26 +491,26 @@ def print_test_result(test_result: TestResult):
     print("")
    print("=" * 70 + "\n")
     print("Metrics Summary\n")
-    for metrics_metadata in test_result.metrics_metadatas:
+    for metric_metadata in test_result.metrics_metadata:
         successful = True
-        if metrics_metadata.error is not None:
+        if metric_metadata.error is not None:
             successful = False
         else:
             # This try block is for user defined custom metrics,
             # which might not handle the score == undefined case elegantly
             try:
-                if not metrics_metadata.success:
+                if not metric_metadata.success:
                     successful = False
             except:
                 successful = False
 
         if not successful:
             print(
-                f" - ❌ {metrics_metadata.metric} (score: {metrics_metadata.score}, threshold: {metrics_metadata.threshold}, strict: {metrics_metadata.strict_mode}, evaluation model: {metrics_metadata.evaluation_model}, reason: {metrics_metadata.reason}, error: {metrics_metadata.error})"
+                f" - ❌ {metric_metadata.metric} (score: {metric_metadata.score}, threshold: {metric_metadata.threshold}, strict: {metric_metadata.strict_mode}, evaluation model: {metric_metadata.evaluation_model}, reason: {metric_metadata.reason}, error: {metric_metadata.error})"
             )
         else:
             print(
-                f" - ✅ {metrics_metadata.metric} (score: {metrics_metadata.score}, threshold: {metrics_metadata.threshold}, strict: {metrics_metadata.strict_mode}, evaluation model: {metrics_metadata.evaluation_model}, reason: {metrics_metadata.reason}, error: {metrics_metadata.error})"
+                f" - ✅ {metric_metadata.metric} (score: {metric_metadata.score}, threshold: {metric_metadata.threshold}, strict: {metric_metadata.strict_mode}, evaluation model: {metric_metadata.evaluation_model}, reason: {metric_metadata.reason}, error: {metric_metadata.error})"
             )
         # if metrics_metadata.score_breakdown:
         #     for metric_name, score in metrics_metadata.score_breakdown.items():
@@ -530,13 +530,13 @@ def aggregate_metric_pass_rates(test_results: List[TestResult]) -> dict:
     metric_successes = {}
 
     for result in test_results:
-        for metrics_metadata in result.metrics_metadatas:
-            metric_name = metrics_metadata.metric
+        for metric_metadata in result.metrics_metadata:
+            metric_name = metric_metadata.metric
             if metric_name not in metric_counts:
                 metric_counts[metric_name] = 0
                 metric_successes[metric_name] = 0
             metric_counts[metric_name] += 1
-            if metrics_metadata.success:
+            if metric_metadata.success:
                 metric_successes[metric_name] += 1
 
     metric_pass_rates = {
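
For consumers of TestResult, the visible effect of this commit is that per-metric results are now read from the metrics_metadata attribute rather than metrics_metadatas. A minimal sketch of reading the renamed field, assuming the top-level evaluate(test_cases, metrics) helper returns the list of TestResult objects (the test case contents and metric choice below are illustrative, not part of this commit):

    from deepeval import evaluate
    from deepeval.metrics import AnswerRelevancyMetric
    from deepeval.test_case import LLMTestCase

    # Illustrative test case; any LLMTestCase works here.
    test_case = LLMTestCase(
        input="What is the primary difference between a comet and an asteroid?",
        actual_output="Comets are icy bodies that develop tails near the Sun; asteroids are rocky or metallic.",
    )

    # Assumption: evaluate() returns List[TestResult].
    test_results = evaluate([test_case], [AnswerRelevancyMetric(threshold=0.7)])

    for test_result in test_results:
        # After this commit the field is `metrics_metadata` (was `metrics_metadatas`).
        for metric_metadata in test_result.metrics_metadata:
            print(metric_metadata.metric, metric_metadata.score, metric_metadata.success)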

tests/test_dataset.py

Lines changed: 1 addition & 47 deletions
@@ -8,6 +8,7 @@
 from deepeval.test_case import LLMTestCase
 from deepeval.evaluate import aggregate_metric_pass_rates, TestResult
 from deepeval.metrics import AnswerRelevancyMetric, BiasMetric
+from deepeval.test_run.api import MetricMetadata
 
 
 class FakeMetric1(BaseMetric):
@@ -80,50 +81,3 @@ def test_create_dataset():
         actual_output_key_name="actual_output",
     )
     assert len(dataset.test_cases) == 10, "Test Cases not loaded from JSON"
-
-
-def test_aggregate_metric_pass_rates():
-    test_case = LLMTestCase(
-        input="What is the primary difference between a comet and an asteroid?",
-        actual_output="Comets and asteroids are both celestial bodies found in our solar system but differ in composition and behavior. Comets, made up of ice, dust, and small rocky particles, develop glowing comas and tails when near the Sun. In contrast, asteroids are primarily rocky or metallic and are mostly found in the asteroid belt between Mars and Jupiter. Rabbits are found in the wild.",
-        expected_output="Comets and asteroids are both celestial bodies found in our solar system but differ in composition and behavior. Comets, made up of ice, dust, and small rocky particles, develop glowing comas and tails when near the Sun. In contrast, asteroids are primarily rocky or metallic and are mostly found in the asteroid belt between Mars and Jupiter. Rabbits are found in the wild.",
-    )
-    fake_metric_1_fail = FakeMetric1(_success=False)
-    fake_metric_2_fail = FakeMetric2()
-    fake_metric_1_fail.measure(test_case=test_case)
-    fake_metric_2_fail.measure(test_case=test_case)
-
-    fake_metric_1_pass = FakeMetric1()
-    fake_metric_2_pass = FakeMetric2()
-    fake_metric_1_pass.measure(test_case=test_case)
-    fake_metric_2_pass.measure(test_case=test_case)
-    test_results = [
-        TestResult(
-            success=False,
-            metrics=[
-                fake_metric_1_fail,
-                fake_metric_2_fail,
-            ],
-            input="some input",
-            actual_output="some output",
-            expected_output="expected output",
-            context=["context"],
-            retrieval_context=["retrieval context"],
-        ),
-        TestResult(
-            success=True,
-            metrics=[
-                fake_metric_1_pass,
-                fake_metric_2_pass,
-            ],
-            input="another input",
-            actual_output="another output",
-            expected_output="another expected output",
-            context=["another context"],
-            retrieval_context=["another retrieval context"],
-        ),
-    ]
-
-    expected_result = {"FakeMetric1": 0.5, "FakeMetric2": 1.0}
-    result = aggregate_metric_pass_rates(test_results)
-    assert result == expected_result
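
The removed test constructed TestResult with a metrics= keyword, which no longer matches the dataclass above (the field is metrics_metadata: List[MetricMetadata]), and the commit adds a MetricMetadata import to this file. A plausible sketch of rebuilding the test against the renamed field follows; the exact MetricMetadata constructor arguments are an assumption based on the attributes used in evaluate.py (metric, score, threshold, success), not something this commit defines:

    from deepeval.evaluate import TestResult, aggregate_metric_pass_rates
    from deepeval.test_run.api import MetricMetadata


    def test_aggregate_metric_pass_rates():
        # Assumed MetricMetadata fields; only metric/score/threshold/success are exercised here.
        failing_1 = MetricMetadata(metric="FakeMetric1", score=0.0, threshold=0.5, success=False)
        passing_1 = MetricMetadata(metric="FakeMetric1", score=1.0, threshold=0.5, success=True)
        passing_2 = MetricMetadata(metric="FakeMetric2", score=1.0, threshold=0.5, success=True)

        test_results = [
            TestResult(
                success=False,
                metrics_metadata=[failing_1, passing_2],
                input="some input",
                actual_output="some output",
                expected_output="expected output",
                context=["context"],
                retrieval_context=["retrieval context"],
            ),
            TestResult(
                success=True,
                metrics_metadata=[passing_1, passing_2],
                input="another input",
                actual_output="another output",
                expected_output="another expected output",
                context=["another context"],
                retrieval_context=["another retrieval context"],
            ),
        ]

        # FakeMetric1 passes in one of two results, FakeMetric2 passes in both.
        assert aggregate_metric_pass_rates(test_results) == {
            "FakeMetric1": 0.5,
            "FakeMetric2": 1.0,
        }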
