Fix tool calling: remove ToolCallParams.TOOL; make __hash__ handle None input_parameters and dict output

penguine-ip · penguine-ip · commit 70393465438d · 2025-02-20T00:10:59.000-08:00
diff --git a/deepeval/metrics/tool_correctness/tool_correctness.py b/deepeval/metrics/tool_correctness/tool_correctness.py
@@ -13,7 +13,6 @@
     ToolCall,
 )
 from deepeval.metrics import BaseMetric
-import json
 
 required_params: List[LLMTestCaseParams] = [
     LLMTestCaseParams.INPUT,
@@ -27,16 +26,13 @@ class ToolCorrectnessMetric(BaseMetric):
     def __init__(
         self,
         threshold: float = 0.5,
-        evaluation_params: List[ToolCallParams] = [ToolCallParams.TOOL],
+        evaluation_params: List[ToolCallParams] = [],
         include_reason: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
         should_exact_match: bool = False,
         should_consider_ordering: bool = False,
     ):
-        assert (
-            ToolCallParams.TOOL in evaluation_params
-        ), "evaluation_params must include ToolCallParams.TOOL"
         self.threshold = 1 if strict_mode else threshold
         self.include_reason = include_reason
         self.strict_mode = strict_mode
diff --git a/deepeval/test_case/llm_test_case.py b/deepeval/test_case/llm_test_case.py
@@ -16,7 +16,6 @@ class LLMTestCaseParams(Enum):
 
 
 class ToolCallParams(Enum):
-    TOOL = "tool"
     INPUT_PARAMETERS = "input_parameters"
     OUTPUT = "output"
 
@@ -40,8 +39,16 @@ def __eq__(self, other):
         )
 
     def __hash__(self):
+        input_params = (
+            self.input_parameters if self.input_parameters is not None else {}
+        )
+        output_hashable = (
+            frozenset(self.output.items())
+            if isinstance(self.output, dict)
+            else self.output
+        )
         return hash(
-            (self.name, frozenset(self.input_parameters.items()), self.output)
+            (self.name, frozenset(input_params.items()), output_hashable)
         )
 
     def __repr__(self):
diff --git a/tests/test_tools_called.py b/tests/test_tools_called.py
@@ -36,16 +36,16 @@
 from deepeval import evaluate
 
 tool_correctness_metric = ToolCorrectnessMetric(verbose_mode=True)
-tool_correctness_geval_metric = GEval(
-    name="Tool Correctness",
-    criteria="Is the expected tools same as tools called",
-    evaluation_params=[
-        LLMTestCaseParams.TOOLS_CALLED,
-        LLMTestCaseParams.EXPECTED_TOOLS,
-    ],
-    verbose_mode=True,
-)
-tool_correctness_geval_metric.measure(tool_test_case)
+# tool_correctness_geval_metric = GEval(
+#     name="Tool Correctness",
+#     criteria="Is the expected tools same as tools called",
+#     evaluation_params=[
+#         LLMTestCaseParams.TOOLS_CALLED,
+#         LLMTestCaseParams.EXPECTED_TOOLS,
+#     ],
+#     verbose_mode=True,
+# )
+# tool_correctness_geval_metric.measure(tool_test_case)
 tool_correctness_metric.measure(tool_test_case)
 
 ################################################