confident-ai · Jun 12, 2025
diff --git a/‎deepeval/metrics/__init__.py
Lines changed: 1 addition & 0 deletions b/‎deepeval/metrics/__init__.py
Lines changed: 1 addition & 0 deletions
diff --git a/‎deepeval/metrics/multimodal_metrics/__init__.py
Lines changed: 1 addition & 0 deletions b/‎deepeval/metrics/multimodal_metrics/__init__.py
Lines changed: 1 addition & 0 deletions
diff --git a/‎deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py b/‎deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py
diff --git a/‎deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py
Lines changed: 350 additions & 0 deletions b/‎deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py
Lines changed: 350 additions & 0 deletions
diff --git a/‎deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py
Lines changed: 11 additions & 0 deletions b/‎deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py
Lines changed: 11 additions & 0 deletions
diff --git a/‎deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py
Lines changed: 136 additions & 0 deletions b/‎deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py
Lines changed: 136 additions & 0 deletions
diff --git a/‎deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py
Lines changed: 63 additions & 0 deletions b/‎deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py
Lines changed: 63 additions & 0 deletions
diff --git a/‎deepeval/models/llms/openai_model.py
Lines changed: 1 addition & 1 deletion b/‎deepeval/models/llms/openai_model.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎deepeval/models/mlllms/openai_model.py
Lines changed: 69 additions & 15 deletions b/‎deepeval/models/mlllms/openai_model.py
Lines changed: 69 additions & 15 deletions
diff --git a/‎tests/test_image_metrics.py
Lines changed: 23 additions & 1 deletion b/‎tests/test_image_metrics.py
Lines changed: 23 additions & 1 deletion
@@ -42,4 +42,5 @@
     MultimodalAnswerRelevancyMetric,
     MultimodalFaithfulnessMetric,
     MultimodalToolCorrectnessMetric,
+    MultimodalGEval,
 )
@@ -21,3 +21,4 @@
 from .multimodal_tool_correctness.multimodal_tool_correctness import (
     MultimodalToolCorrectnessMetric,
 )
+from .multimodal_g_eval.multimodal_g_eval import MultimodalGEval
@@ -0,0 +1,350 @@
+"""LLM evaluated metric based on the GEval framework: https://arxiv.org/pdf/2303.16634.pdf"""
+
+from typing import Optional, List, Tuple, Union
+from deepeval.models import DeepEvalBaseMLLM
+from deepeval.metrics import BaseMultimodalMetric
+from deepeval.test_case import (
+    MLLMTestCaseParams,
+    MLLMTestCase,
+)
+from deepeval.metrics.multimodal_metrics.multimodal_g_eval.template import MultimodalGEvalTemplate
+from deepeval.metrics.multimodal_metrics.multimodal_g_eval.schema import *
+from deepeval.utils import get_or_create_event_loop, prettify_list
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.utils import (
+    initialize_multimodal_model,
+    check_mllm_test_case_params,
+    construct_verbose_logs,
+    trimAndLoadJson,
+)
+from deepeval.metrics.multimodal_metrics.multimodal_g_eval.utils import (
+    construct_test_case_list,
+    no_multimodal_log_prob_support,
+    construct_g_eval_params_string,
+)
+from deepeval.metrics.g_eval.utils import (
+    Rubric,
+    format_rubrics,
+    calculate_weighted_summed_score,
+    validate_and_sort_rubrics,
+    validate_criteria_and_evaluation_steps,
+    number_evaluation_steps,
+    get_score_range,
+)
+
+
+class MultimodalGEval(BaseMultimodalMetric):
+    def __init__(
+        self,
+        name: str,
+        evaluation_params: List[MLLMTestCaseParams],
+        criteria: Optional[str] = None,
+        evaluation_steps: Optional[List[str]] = None,
+        rubric: Optional[List[Rubric]] = None,
+        model: Optional[Union[str, DeepEvalBaseMLLM]] = None,
+        threshold: float = 0.5,
+        top_logprobs: int = 20,
+        async_mode: bool = True,
+        strict_mode: bool = False,
+        verbose_mode: bool = False,
+        _include_g_eval_suffix: bool = True,
+    ):
+        """Configure a G-Eval style metric for multimodal test cases.
+
+        Args:
+            name: Display name of the metric.
+            evaluation_params: Test case fields handed to the judge model.
+            criteria: Free-text criteria; used to generate evaluation steps
+                when ``evaluation_steps`` is not supplied.
+            evaluation_steps: Explicit steps; when truthy they are used as-is
+                and no generation call is made.
+            rubric: Optional rubric, passed through
+                ``validate_and_sort_rubrics`` before being stored.
+            model: Model name or instance, resolved by
+                ``initialize_multimodal_model``.
+            threshold: Minimum passing score; ignored in strict mode.
+            top_logprobs: Forwarded to the model's raw-response call.
+            async_mode: When true, ``measure`` runs ``a_measure`` on an
+                event loop.
+            strict_mode: When true, the threshold is pinned to 1 and the
+                strict (binary) prompt is used.
+            verbose_mode: Stored on the instance for the logging helpers.
+            _include_g_eval_suffix: When true, ``__name__`` appends
+                " (GEval)" to ``name``.
+        """
+        validate_criteria_and_evaluation_steps(criteria, evaluation_steps)
+
+        # What is being judged and against which criteria.
+        self.name = name
+        self.evaluation_params = evaluation_params
+        self.criteria = criteria
+        self.rubric = validate_and_sort_rubrics(rubric)
+
+        # Resolve the judge model and remember whether it is a native one.
+        resolved_model, is_native = initialize_multimodal_model(model)
+        self.model = resolved_model
+        self.using_native_model = is_native
+        self.evaluation_model = self.model.get_model_name()
+        self.evaluation_steps = evaluation_steps
+
+        # Strict mode only accepts a perfect score.
+        if strict_mode:
+            self.threshold = 1
+        else:
+            self.threshold = threshold
+
+        # Runtime behaviour flags.
+        self.top_logprobs = top_logprobs
+        self.strict_mode = strict_mode
+        self.async_mode = async_mode
+        self.verbose_mode = verbose_mode
+        self._include_g_eval_suffix = _include_g_eval_suffix
+
+    def measure(
+        self,
+        test_case: MLLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _additional_context: Optional[str] = None,
+    ) -> float:
+        """Score ``test_case`` and return the resulting score.
+
+        The test case is first checked with ``check_mllm_test_case_params``.
+        ``evaluation_cost`` is reset to 0 for native models and to ``None``
+        otherwise. In async mode the work is delegated to ``a_measure`` on an
+        event loop; otherwise the blocking pipeline runs here and populates
+        ``reason``, ``score``, ``success`` and ``verbose_logs``.
+
+        Args:
+            test_case: The multimodal test case to evaluate.
+            _show_indicator: Forwarded to the progress indicator.
+            _in_component: Forwarded to the progress indicator.
+            _additional_context: Optional extra text appended to the prompt.
+
+        Returns:
+            ``self.score`` after evaluation.
+        """
+        check_mllm_test_case_params(
+            test_case, self.evaluation_params, None, None, self
+        )
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                loop.run_until_complete(
+                    self.a_measure(
+                        test_case,
+                        _show_indicator=False,
+                        _in_component=_in_component,
+                        _additional_context=_additional_context,
+                    )
+                )
+            else:
+                self.evaluation_steps: List[str] = (
+                    self._generate_evaluation_steps()
+                )
+                g_score, reason = self._evaluate(
+                    test_case, _additional_context=_additional_context
+                )
+                self.reason = reason
+                # The strict prompt asks for a binary 0/1 verdict, so it must
+                # not be divided by 10: 1 / 10 would fall below the strict
+                # threshold of 1 and every strict evaluation would fail.
+                # This mirrors the scoring rule used by ``a_measure``.
+                self.score = (
+                    float(g_score) / 10
+                    if not self.strict_mode
+                    else int(g_score)
+                )
+                self.success = self.score >= self.threshold
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        f"Criteria:\n{self.criteria}",
+                        f"Evaluation Steps:\n{prettify_list(self.evaluation_steps)}",
+                        f"Rubric:\n{format_rubrics(self.rubric)}",
+                        f"Score: {self.score}\nReason: {self.reason}",
+                    ],
+                )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: MLLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _additional_context: Optional[str] = None,
+    ) -> float:
+        """Asynchronously score ``test_case`` and return the score.
+
+        Validates the test case, resets ``evaluation_cost``, obtains the
+        evaluation steps, asks the judge model for a score and reason, and
+        stores ``reason``, ``score``, ``success`` and ``verbose_logs`` on the
+        instance. Non-strict scores are divided by 10; strict scores are
+        cast to ``int``.
+        """
+        check_mllm_test_case_params(
+            test_case, self.evaluation_params, None, None, self
+        )
+
+        if self.using_native_model:
+            self.evaluation_cost = 0
+        else:
+            self.evaluation_cost = None
+
+        progress = metric_progress_indicator(
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
+        )
+        with progress:
+            self.evaluation_steps = await self._a_generate_evaluation_steps()
+            raw_score, explanation = await self._a_evaluate(
+                test_case, _additional_context=_additional_context
+            )
+            self.reason = explanation
+
+            if self.strict_mode:
+                self.score = int(raw_score)
+            else:
+                self.score = float(raw_score) / 10
+            self.success = self.score >= self.threshold
+
+            log_sections = [
+                f"Criteria:\n{self.criteria}",
+                f"Evaluation Steps:\n{prettify_list(self.evaluation_steps)}",
+                f"Rubric:\n{format_rubrics(self.rubric)}",
+                f"Score: {self.score}\nReason: {self.reason}",
+            ]
+            self.verbose_logs = construct_verbose_logs(self, steps=log_sections)
+            return self.score
+
+    async def _a_generate_evaluation_steps(self) -> List[str]:
+        """Return the evaluation steps, generating them from the criteria if needed.
+
+        Steps already stored on the instance are returned untouched. Otherwise
+        the judge model is prompted for them. Native models return a
+        ``(result, cost)`` pair and the cost is added to ``evaluation_cost``.
+        Custom models are first called with ``schema=Steps``; if that raises
+        ``TypeError`` the call is repeated without a schema and the reply is
+        parsed as JSON.
+        """
+        preset_steps = self.evaluation_steps
+        if preset_steps:
+            return preset_steps
+
+        prompt = MultimodalGEvalTemplate.generate_evaluation_steps(
+            criteria=self.criteria,
+            parameters=construct_g_eval_params_string(self.evaluation_params),
+        )
+
+        if not self.using_native_model:
+            try:
+                parsed: Steps = await self.model.a_generate(
+                    [prompt], schema=Steps
+                )
+                return parsed.steps
+            except TypeError:
+                raw_reply = await self.model.a_generate([prompt])
+                return trimAndLoadJson(raw_reply, self)["steps"]
+
+        parsed, cost = await self.model.a_generate([prompt], schema=Steps)
+        self.evaluation_cost += cost
+        return parsed.steps
+
+    def _generate_evaluation_steps(self) -> List[str]:
+        """Return the evaluation steps, generating them from the criteria if needed.
+
+        Blocking counterpart of ``_a_generate_evaluation_steps``. Steps
+        already stored on the instance are returned untouched. Native models
+        return a ``(result, cost)`` pair and the cost is added to
+        ``evaluation_cost``. Custom models are first called with
+        ``schema=Steps``; on ``TypeError`` the call is repeated without a
+        schema and the reply is parsed as JSON.
+        """
+        preset_steps = self.evaluation_steps
+        if preset_steps:
+            return preset_steps
+
+        prompt = MultimodalGEvalTemplate.generate_evaluation_steps(
+            criteria=self.criteria,
+            parameters=construct_g_eval_params_string(self.evaluation_params),
+        )
+
+        if not self.using_native_model:
+            try:
+                parsed: Steps = self.model.generate([prompt], schema=Steps)
+                return parsed.steps
+            except TypeError:
+                raw_reply = self.model.generate([prompt])
+                return trimAndLoadJson(raw_reply, self)["steps"]
+
+        parsed, cost = self.model.generate([prompt], schema=Steps)
+        self.evaluation_cost += cost
+        return parsed.steps
+
+    async def _a_evaluate(
+        self, test_case: MLLMTestCase, _additional_context: Optional[str] = None
+    ) -> Tuple[Union[int, float], str]:
+        """Ask the judge model for a raw score and a reason.
+
+        Builds the strict or rubric-aware prompt, then tries the log-prob
+        path (``a_generate_raw_response``). If the model lacks that method,
+        or is flagged as not supporting log probabilities, it falls back to
+        a structured ``a_generate`` call.
+
+        Args:
+            test_case: The multimodal test case being judged.
+            _additional_context: Optional extra text appended to the prompt.
+
+        Returns:
+            ``(score, reason)``. The score is the judge's raw value (not yet
+            divided by 10); on the log-prob path in non-strict mode it is the
+            weighted summed score when that can be computed.
+        """
+        test_case_list = construct_test_case_list(
+            self.evaluation_params, test_case
+        )
+        g_eval_params_str = construct_g_eval_params_string(
+            self.evaluation_params
+        )
+
+        if not self.strict_mode:
+            rubric_str = format_rubrics(self.rubric) if self.rubric else None
+            prompt = MultimodalGEvalTemplate.generate_evaluation_results(
+                evaluation_steps=number_evaluation_steps(self.evaluation_steps),
+                test_case_list=test_case_list,
+                parameters=g_eval_params_str,
+                rubric=rubric_str,
+                score_range=get_score_range(self.rubric),
+                _additional_context=_additional_context,
+            )
+        else:
+            prompt = MultimodalGEvalTemplate.generate_strict_evaluation_results(
+                evaluation_steps=number_evaluation_steps(self.evaluation_steps),
+                test_case_list=test_case_list,
+                parameters=g_eval_params_str,
+                _additional_context=_additional_context,
+            )
+        try:
+            # Models without log-prob support go straight to the fallback.
+            if no_multimodal_log_prob_support(self.model):
+                raise AttributeError("log_probs unsupported.")
+
+            # No native-model check needed here: the raw-response method only
+            # exists on deepeval's native model.
+            res, cost = await self.model.a_generate_raw_response(
+                prompt, top_logprobs=self.top_logprobs
+            )
+            self.evaluation_cost += cost
+            data = trimAndLoadJson(res.choices[0].message.content, self)
+
+            reason = data["reason"]
+            score = data["score"]
+            if self.strict_mode:
+                return score, reason
+
+            try:
+                weighted_summed_score = calculate_weighted_summed_score(
+                    score, res
+                )
+                return weighted_summed_score, reason
+            except Exception:
+                # Best effort: keep the plain score if weighting fails, but
+                # do not swallow KeyboardInterrupt/SystemExit.
+                return score, reason
+        except AttributeError:
+            # Reached when a_generate_raw_response does not exist or log
+            # probabilities are unsupported for this model.
+            if self.using_native_model:
+                # The native model returns the parsed schema object rather
+                # than a JSON string, so request ReasonScore explicitly, the
+                # same way the step generators request Steps.
+                res, cost = await self.model.a_generate(
+                    prompt, schema=ReasonScore
+                )
+                self.evaluation_cost += cost
+                return res.score, res.reason
+            else:
+                try:
+                    res: ReasonScore = await self.model.a_generate(
+                        prompt, schema=ReasonScore
+                    )
+                    return res.score, res.reason
+                except TypeError:
+                    # Custom model without schema support: parse raw text.
+                    res = await self.model.a_generate(prompt)
+                    data = trimAndLoadJson(res, self)
+                    return data["score"], data["reason"]
+
+    def _evaluate(
+        self, test_case: MLLMTestCase, _additional_context: Optional[str] = None
+    ) -> Tuple[Union[int, float], str]:
+        """Ask the judge model for a raw score and a reason (blocking).
+
+        Builds the strict or rubric-aware prompt, then tries the log-prob
+        path (``generate_raw_response``). If the model lacks that method, or
+        is flagged as not supporting log probabilities, it falls back to a
+        structured ``generate`` call.
+
+        Args:
+            test_case: The multimodal test case being judged.
+            _additional_context: Optional extra text appended to the prompt.
+
+        Returns:
+            ``(score, reason)``. The score is the judge's raw value (not yet
+            divided by 10); on the log-prob path in non-strict mode it is the
+            weighted summed score when that can be computed.
+        """
+        test_case_list = construct_test_case_list(
+            self.evaluation_params, test_case
+        )
+        g_eval_params_str = construct_g_eval_params_string(
+            self.evaluation_params
+        )
+
+        if not self.strict_mode:
+            rubric_str = format_rubrics(self.rubric) if self.rubric else None
+            prompt = MultimodalGEvalTemplate.generate_evaluation_results(
+                evaluation_steps=number_evaluation_steps(self.evaluation_steps),
+                test_case_list=test_case_list,
+                parameters=g_eval_params_str,
+                rubric=rubric_str,
+                score_range=get_score_range(self.rubric),
+                _additional_context=_additional_context,
+            )
+        else:
+            prompt = MultimodalGEvalTemplate.generate_strict_evaluation_results(
+                evaluation_steps=number_evaluation_steps(self.evaluation_steps),
+                test_case_list=test_case_list,
+                parameters=g_eval_params_str,
+                _additional_context=_additional_context,
+            )
+
+        try:
+            # Models without log-prob support go straight to the fallback.
+            if no_multimodal_log_prob_support(self.model):
+                raise AttributeError("log_probs unsupported.")
+
+            res, cost = self.model.generate_raw_response(
+                prompt, top_logprobs=self.top_logprobs
+            )
+            self.evaluation_cost += cost
+            data = trimAndLoadJson(res.choices[0].message.content, self)
+
+            reason = data["reason"]
+            score = data["score"]
+            if self.strict_mode:
+                return score, reason
+
+            try:
+                weighted_summed_score = calculate_weighted_summed_score(
+                    score, res
+                )
+                return weighted_summed_score, reason
+            except Exception:
+                # Best effort: keep the plain score if weighting fails, but
+                # do not swallow KeyboardInterrupt/SystemExit.
+                return score, reason
+        except AttributeError:
+            # Reached when generate_raw_response does not exist or log
+            # probabilities are unsupported for this model.
+            if self.using_native_model:
+                # The native model returns the parsed schema object rather
+                # than a JSON string, so request ReasonScore explicitly, the
+                # same way the step generators request Steps.
+                res, cost = self.model.generate(prompt, schema=ReasonScore)
+                self.evaluation_cost += cost
+                return res.score, res.reason
+            else:
+                try:
+                    res: ReasonScore = self.model.generate(
+                        prompt, schema=ReasonScore
+                    )
+                    return res.score, res.reason
+                except TypeError:
+                    # Custom model without schema support: parse raw text.
+                    res = self.model.generate(prompt)
+                    data = trimAndLoadJson(res, self)
+                    return data["score"], data["reason"]
+
+    def is_successful(self) -> bool:
+        """Recompute and return whether the metric passed.
+
+        The metric fails when ``self.error`` is set. Otherwise it passes when
+        ``self.score`` reaches ``self.threshold``; if that comparison cannot
+        be made (for example the score is missing or ``None``) the metric is
+        treated as failed.
+        """
+        if self.error is not None:
+            self.success = False
+            return self.success
+
+        try:
+            self.success = self.score >= self.threshold
+        except Exception:
+            # Deliberately best-effort, but narrowed from a bare ``except`` so
+            # KeyboardInterrupt/SystemExit are not swallowed.
+            self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        """Return the metric name, with " (GEval)" appended when the suffix is enabled."""
+        return (
+            f"{self.name} (GEval)"
+            if self._include_g_eval_suffix
+            else self.name
+        )
@@ -0,0 +1,11 @@
+from typing import List
+from pydantic import BaseModel
+
+
+class ReasonScore(BaseModel):
+    """Structured judge verdict: an explanation plus a numeric score."""
+
+    # Explanation the judge gives for the score.
+    reason: str
+    # Raw score as returned by the judge model.
+    score: float
+
+
+class Steps(BaseModel):
+    """Structured reply holding the generated evaluation steps."""
+
+    # One string per evaluation step.
+    steps: List[str]
@@ -0,0 +1,136 @@
+from typing import List, Optional, Tuple
+import textwrap
+
+
+class MultimodalGEvalTemplate:
+    
+    @staticmethod
+    def generate_evaluation_steps(parameters: str, criteria: str):
+        """Build the prompt that asks the model to derive evaluation steps.
+
+        Args:
+            parameters: Human-readable list of the test case fields being judged.
+            criteria: The evaluation criteria the steps must be based on.
+
+        Returns:
+            A prompt string requesting JSON with a "steps" key.
+        """
+        # NOTE(review): dedent runs after f-string interpolation, so a
+        # multi-line `criteria` with unindented lines may stop the common
+        # indentation from being stripped — confirm the rendered prompt.
+        return textwrap.dedent(
+            f"""Given an evaluation criteria which outlines how you should judge the {parameters}, generate 3-4 concise evaluation steps based on the criteria below. You MUST make it clear how to evaluate {parameters} in relation to one another.
+
+            Evaluation Criteria:
+            {criteria}
+
+            **
+            IMPORTANT: Please make sure to only return in JSON format, with the "steps" key as a list of strings. No words or explanation is needed.
+            Example JSON:
+            {{
+                "steps": <list_of_strings>
+            }}
+            **
+
+            JSON:
+            """
+        )
+        
+    @staticmethod
+    def generate_evaluation_results(
+        evaluation_steps: str,
+        test_case_list: List,
+        parameters: str,
+        rubric: Optional[str] = None,
+        score_range: Tuple[int, int] = (0, 10),
+        _additional_context: Optional[str] = None,
+    ):
+        """Build the multimodal prompt for a graded (non-strict) evaluation.
+
+        Args:
+            evaluation_steps: Pre-formatted evaluation steps text.
+            test_case_list: Test case content placed between the instruction
+                header and the footer of the prompt.
+            parameters: Human-readable list of the test case fields being judged.
+            rubric: Optional rubric text; when given, the wording refers to it.
+            score_range: ``(low, high)`` bounds quoted in the instructions.
+            _additional_context: Optional extra text added to the footer.
+
+        Returns:
+            A list: the instruction header string, then the items of
+            ``test_case_list``, then the footer string with the example JSON.
+        """
+        # The wording below switches depending on whether a rubric was given.
+        rubric_text = f"Rubric:\n{rubric}\n" if rubric else ""
+        dependencies = (
+            "evaluation steps and rubric" if rubric else "evaluation steps"
+        )
+        score_explanation = (
+            "based on the rubric provided"
+            if rubric
+            else f"with {score_range[1]} indicating strong alignment with the evaluation steps and {score_range[0]} indicating no alignment"
+        )
+        reasoning_expectation = (
+            "Be specific and grounded in the evaluation steps and rubric."
+            if rubric
+            else "Be specific and grounded in the evaluation steps."
+        )
+        additional_context = (
+            f"\n\nAdditional Context:\n{_additional_context}\n"
+            if _additional_context
+            else ""
+        )
+
+        # NOTE(review): dedent runs after f-string interpolation, so
+        # multi-line interpolated values with unindented lines may stop the
+        # common indentation from being stripped — confirm the rendered prompt.
+        return [
+            textwrap.dedent(
+                f"""You are an evaluator. Given the following {dependencies}, assess the response below and return a JSON object with two fields:
+
+                - `"score"`: an integer between {score_range[0]} and {score_range[1]}, {score_explanation}.
+                - `"reason"`: a brief explanation for why the score was given. This must mention specific strengths or shortcomings, referencing relevant details from the input. Do **not** quote the score itself in the explanation.
+
+                Your explanation should:
+                - {reasoning_expectation}
+                - Mention key details from the test case parameters.
+                - Be concise, clear, and focused on the evaluation logic.
+
+                Only return valid JSON. Do **not** include any extra commentary or text.
+
+                ---
+
+                Evaluation Steps:
+                {evaluation_steps}
+
+                {rubric_text}
+                Test Case:
+                ************************
+                """
+            )] + test_case_list + [
+            textwrap.dedent(f"""
+                ************************
+                \n\n\n
+                Parameters:
+                {parameters}
+                {additional_context}
+
+                ---
+                **Example JSON:**
+                {{
+                    "score": {score_range[0]},
+                    "reason": "your concise and informative reason here"
+                }}
+
+                JSON:
+                """
+            )
+        ]
+
+    @staticmethod
+    def generate_strict_evaluation_results(
+        evaluation_steps: str,
+        test_case_list: List,
+        parameters: str,
+        _additional_context: Optional[str] = None,
+    ):
+        """Build the multimodal prompt for a strict (score 0 or 1) evaluation.
+
+        Args:
+            evaluation_steps: Pre-formatted evaluation steps text.
+            test_case_list: Test case content placed between the instruction
+                header and the footer of the prompt.
+            parameters: Human-readable list of the test case fields being judged.
+            _additional_context: Optional extra text added to the footer.
+
+        Returns:
+            A list: the instruction header string, then the items of
+            ``test_case_list``, then the footer string with the example JSON.
+        """
+        additional_context = (
+            f"\n\nAdditional Context:\n{_additional_context}\n"
+            if _additional_context
+            else ""
+        )
+        # NOTE(review): dedent runs after f-string interpolation, so
+        # multi-line interpolated values with unindented lines may stop the
+        # common indentation from being stripped — confirm the rendered prompt.
+        return [
+            textwrap.dedent(
+                f"""Given the evaluation steps, return a JSON with two keys: 1) a `score` key that is STRICTLY EITHER 1 (follows the criteria 100% outlined in the evaluation steps), OR 0 (does not follow the criteria), and 2) a `reason` key, a reason for the given score, but DO NOT QUOTE THE SCORE in your reason. Please mention specific information from {parameters} in your reason, but be very concise with it!
+
+                Evaluation Steps:
+                {evaluation_steps}
+                ************************
+                """
+            )] + test_case_list + [
+            textwrap.dedent(f"""
+                ************************
+                {additional_context}
+                **
+                IMPORTANT: Please make sure to only return in JSON format, with the "score" and "reason" key. No words or explanation is needed.
+
+                Example JSON:
+                {{
+                    "score": 0,
+                    "reason": "The text does not follow the evaluation steps provided."
+                }}
+                **
+
+                JSON:
+                """
+            )
+        ]
@@ -0,0 +1,63 @@
+from deepeval.test_case import MLLMTestCaseParams, MLLMTestCase, ToolCall
+from deepeval.test_case.mllm_test_case import MLLMImage
+from deepeval.metrics.g_eval.utils import (
+    unsupported_log_probs_gpt_models,
+)
+from deepeval.models import (
+    DeepEvalBaseMLLM,
+    MultimodalOpenAIModel,
+)
+
+from typing import List, Union
+
+
+# Maps each test case parameter to the human-readable label used when the
+# parameter is named in prompts and in the test case listing.
+G_EVAL_PARAMS = {
+    MLLMTestCaseParams.INPUT: "Input",
+    MLLMTestCaseParams.ACTUAL_OUTPUT: "Actual Output",
+    MLLMTestCaseParams.EXPECTED_OUTPUT: "Expected Output",
+    MLLMTestCaseParams.CONTEXT: "Context",
+    MLLMTestCaseParams.RETRIEVAL_CONTEXT: "Retrieval Context",
+    MLLMTestCaseParams.EXPECTED_TOOLS: "Expected Tools",
+    MLLMTestCaseParams.TOOLS_CALLED: "Tools Called",
+}
+
+
+def construct_g_eval_params_string(
+    mllm_test_case_params: List[MLLMTestCaseParams],
+):
+    """Join the labels of the given parameters into an English list.
+
+    One label is returned as-is, two are joined with " and ", and three or
+    more are comma-separated with ", and " before the last one.
+    """
+    labels = [G_EVAL_PARAMS[param] for param in mllm_test_case_params]
+
+    # " and ".join() of a single label is just that label.
+    if 1 <= len(labels) <= 2:
+        return " and ".join(labels)
+
+    leading = ", ".join(labels[:-1])
+    return leading + ", and " + labels[-1]
+
+def construct_test_case_list(
+    evaluation_params: List[MLLMTestCaseParams], test_case: MLLMTestCase
+) -> List[Union[str, MLLMImage]]:
+    """Flatten the selected test case fields into one prompt-ready list.
+
+    For every parameter a labelled header string is emitted, followed by the
+    items of that field on the test case. ``ToolCall`` items are replaced by
+    their ``repr``; every other item is passed through unchanged.
+    """
+    flattened = []
+    for param in evaluation_params:
+        section = [f"\n\n\n{G_EVAL_PARAMS[param]}:\n"]
+        section.extend(
+            repr(item) if isinstance(item, ToolCall) else item
+            for item in getattr(test_case, param.value)
+        )
+        flattened += section
+    return flattened
+
+def no_multimodal_log_prob_support(model: Union[str, DeepEvalBaseMLLM]):
+    """Return True when the model is listed as lacking log-probability support.
+
+    Accepts either a model name or a ``MultimodalOpenAIModel`` instance (its
+    ``model_name`` is checked). Any other kind of model yields False.
+    """
+    if isinstance(model, str):
+        candidate_name = model
+    elif isinstance(model, MultimodalOpenAIModel):
+        candidate_name = model.model_name
+    else:
+        return False
+    return candidate_name in unsupported_log_probs_gpt_models
@@ -179,7 +179,7 @@ def __init__(
         super().__init__(model_name)
 
     ###############################################
-    # Other generate functions
+    # Generate functions
     ###############################################
 
     @retry(
 
@@ -1,5 +1,6 @@
 from typing import Optional, Tuple, List, Union
 from openai import OpenAI, AsyncOpenAI
+from openai.types.chat import ParsedChatCompletion
 from pydantic import BaseModel
 from io import BytesIO
 import logging
@@ -63,15 +64,6 @@ def log_retry_error(retry_state: RetryCallState):
     "gpt-4o-2024-11-20",
     "gpt-4o-mini",
     "gpt-4o-mini-2024-07-18",
-    "gpt-4",
-    "gpt-4-0125-preview",
-    "gpt-4-0613",
-    "gpt-4-1106-preview",
-    "gpt-4-turbo",
-    "gpt-4-turbo-2024-04-09",
-    "gpt-4-turbo-preview",
-    "gpt-4-32k",
-    "gpt-4-32k-0613",
 ]
 
 default_multimodal_gpt_model = "gpt-4o"
@@ -101,6 +93,10 @@ def __init__(
 
         super().__init__(model_name, *args, **kwargs)
 
+    ###############################################
+    # Generate functions
+    ###############################################
+
     @retry(
         wait=wait_exponential_jitter(initial=1, exp_base=2, jitter=2, max=10),
         retry=retry_if_exception_type(retryable_exceptions),
@@ -113,9 +109,10 @@ def generate(
     ) -> Tuple[str, float]:
         client = OpenAI(api_key=self._openai_api_key)
         prompt = self.generate_prompt(multimodal_input)
+        messages = [{"role": "user", "content": prompt}]
         response = client.beta.chat.completions.parse(
             model=self.model_name,
-            messages=[{"role": "user", "content": prompt}],
+            messages=messages,
             response_format=schema,
         )
         input_tokens = response.usage.prompt_tokens
@@ -136,9 +133,10 @@ async def a_generate(
     ) -> Tuple[str, float]:
         client = AsyncOpenAI(api_key=self._openai_api_key)
         prompt = self.generate_prompt(multimodal_input)
+        messages = [{"role": "user", "content": prompt}]
         response = await client.beta.chat.completions.parse(
             model=self.model_name,
-            messages=[{"role": "user", "content": prompt}],
+            messages=messages,
             response_format=schema,
         )
         input_tokens = response.usage.prompt_tokens
@@ -147,6 +145,64 @@ async def a_generate(
         generated_text = response.choices[0].message.parsed
         return generated_text, total_cost
 
+    ###############################################
+    # Other generate functions
+    ###############################################
+
+    @retry(
+        wait=wait_exponential_jitter(initial=1, exp_base=2, jitter=2, max=10),
+        retry=retry_if_exception_type(retryable_exceptions),
+        after=log_retry_error,
+    )
+    def generate_raw_response(
+        self,
+        multimodal_input: List[Union[str, MLLMImage]],
+        top_logprobs: int = 5,
+    ) -> Tuple[ParsedChatCompletion, float]:
+        """Request a completion with log probabilities and return it with its cost.
+
+        Args:
+            multimodal_input: Text and image items converted into the user
+                message by ``generate_prompt``.
+            top_logprobs: Number of top log probabilities requested.
+
+        Returns:
+            ``(completion, cost)`` where cost comes from ``calculate_cost``
+            applied to the prompt and completion token counts.
+        """
+        # NOTE(review): annotated as ParsedChatCompletion, but the object
+        # comes from chat.completions.create — confirm the intended type.
+        openai_client = OpenAI(api_key=self._openai_api_key)
+        user_content = self.generate_prompt(multimodal_input)
+        completion = openai_client.chat.completions.create(
+            model=self.model_name,
+            messages=[{"role": "user", "content": user_content}],
+            logprobs=True,
+            top_logprobs=top_logprobs,
+        )
+        usage = completion.usage
+        total_cost = self.calculate_cost(
+            usage.prompt_tokens, usage.completion_tokens
+        )
+        return completion, total_cost
+
+    @retry(
+        wait=wait_exponential_jitter(initial=1, exp_base=2, jitter=2, max=10),
+        retry=retry_if_exception_type(retryable_exceptions),
+        after=log_retry_error,
+    )
+    async def a_generate_raw_response(
+        self,
+        multimodal_input: List[Union[str, MLLMImage]],
+        top_logprobs: int = 5,
+    ) -> Tuple[ParsedChatCompletion, float]:
+        """Asynchronously request a completion with log probabilities.
+
+        Args:
+            multimodal_input: Text and image items converted into the user
+                message by ``generate_prompt``.
+            top_logprobs: Number of top log probabilities requested.
+
+        Returns:
+            ``(completion, cost)`` where cost comes from ``calculate_cost``
+            applied to the prompt and completion token counts.
+        """
+        # NOTE(review): annotated as ParsedChatCompletion, but the object
+        # comes from chat.completions.create — confirm the intended type.
+        openai_client = AsyncOpenAI(api_key=self._openai_api_key)
+        user_content = self.generate_prompt(multimodal_input)
+        completion = await openai_client.chat.completions.create(
+            model=self.model_name,
+            messages=[{"role": "user", "content": user_content}],
+            logprobs=True,
+            top_logprobs=top_logprobs,
+        )
+        usage = completion.usage
+        total_cost = self.calculate_cost(
+            usage.prompt_tokens, usage.completion_tokens
+        )
+        return completion, total_cost
+
+    ###############################################
+    # Utilities
+    ###############################################
+
     def generate_prompt(
         self, multimodal_input: List[Union[str, MLLMImage]] = []
     ):
@@ -173,10 +229,6 @@ def generate_prompt(
                 prompt.append(visual_dict)
         return prompt
 
-    ###############################################
-    # Utilities
-    ###############################################
-
     def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
         pricing = model_pricing.get(
             self.model_name, model_pricing["gpt-4o"]
@@ -187,6 +239,8 @@ def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
 
     def encode_pil_image(self, pil_image):
         image_buffer = BytesIO()
+        if pil_image.mode in ("RGBA", "LA", "P"):
+            pil_image = pil_image.convert("RGB")
         pil_image.save(image_buffer, format="JPEG")
         image_bytes = image_buffer.getvalue()
         base64_encoded_image = base64.b64encode(image_bytes).decode("utf-8")
 
@@ -3,7 +3,7 @@
 from deepeval.dataset import EvaluationDataset
 from deepeval import evaluate
 from deepeval.evaluate.configs import AsyncConfig, DisplayConfig
-from deepeval.test_case import MLLMTestCase, LLMTestCase, MLLMImage
+from deepeval.test_case import MLLMTestCase, LLMTestCase, MLLMImage, MLLMTestCaseParams, ToolCall
 from deepeval.metrics import (
     AnswerRelevancyMetric,
     ImageEditingMetric,
@@ -13,6 +13,7 @@
     MultimodalContextualPrecisionMetric,
     MultimodalAnswerRelevancyMetric,
     MultimodalFaithfulnessMetric,
+    MultimodalGEval
 )
 
 image_path = "./data/image_1.jpg"
@@ -93,6 +94,8 @@ def multimodal_rag_case():
                 url="https://upload.wikimedia.org/wikipedia/commons/thumb/0/03/Liberty-from-behind-2024.jpg/330px-Liberty-from-behind-2024.jpg"
             ),
         ],
+        tools_called=[ToolCall(name="google search")],
+        expected_tools=[ToolCall(name="google search")],
     )
 
 
@@ -128,6 +131,25 @@ def test_multimodal_rag_case(multimodal_rag_case):
             MultimodalContextualPrecisionMetric(),
             MultimodalAnswerRelevancyMetric(),
             MultimodalFaithfulnessMetric(),
+            MultimodalGEval(
+                name="MultimodalGEval",
+                evaluation_params=[
+                    MLLMTestCaseParams.INPUT,
+                    MLLMTestCaseParams.ACTUAL_OUTPUT,
+                    MLLMTestCaseParams.EXPECTED_OUTPUT,
+                    MLLMTestCaseParams.CONTEXT,
+                    MLLMTestCaseParams.RETRIEVAL_CONTEXT,
+                    MLLMTestCaseParams.TOOLS_CALLED,
+                    MLLMTestCaseParams.EXPECTED_TOOLS,
+                ],
+                model="gpt-4o",
+                evaluation_steps=[
+                    "Determine if the output image follows the input instructions clearly.",
+                    "Determine if the expected output aligns with the actual output.",
+                    "Determine if the context is aligned with the retrieval context.",
+                    "Determine if the tools called are aligned with the expected tools.",
+                ],
+            ),
         ],
         display_config=DisplayConfig(verbose_mode=True),
         async_config=AsyncConfig(run_async=False),
Original file line number	Diff line number	Diff line change
`@@ -42,4 +42,5 @@`
`42`	`42`	`MultimodalAnswerRelevancyMetric,`
`43`	`43`	`MultimodalFaithfulnessMetric,`
`44`	`44`	`MultimodalToolCorrectnessMetric,`
	`45`	`+ MultimodalGEval,`
`45`	`46`	`)`
Original file line number	Diff line number	Diff line change
`@@ -21,3 +21,4 @@`
`21`	`21`	`from .multimodal_tool_correctness.multimodal_tool_correctness import (`
`22`	`22`	`MultimodalToolCorrectnessMetric,`
`23`	`23`	`)`
	`24`	`+from .multimodal_g_eval.multimodal_g_eval import MultimodalGEval`