
Commit e344c1b

Authored Jun 12, 2025
Merge pull request #1684 from confident-ai/multimodal-geval
Multimodal G-Eval
2 parents (ee0f7bf + dc52e91) · commit e344c1b

10 files changed, +655 -17 lines
 

‎deepeval/metrics/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -42,4 +42,5 @@
     MultimodalAnswerRelevancyMetric,
     MultimodalFaithfulnessMetric,
     MultimodalToolCorrectnessMetric,
+    MultimodalGEval,
 )

‎deepeval/metrics/multimodal_metrics/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -21,3 +21,4 @@
 from .multimodal_tool_correctness.multimodal_tool_correctness import (
     MultimodalToolCorrectnessMetric,
 )
+from .multimodal_g_eval.multimodal_g_eval import MultimodalGEval

‎deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py

Whitespace-only changes.

deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py

Lines changed: 350 additions & 0 deletions

@@ -0,0 +1,350 @@
"""LLM evaluated metric based on the GEval framework: https://arxiv.org/pdf/2303.16634.pdf"""

from typing import Optional, List, Tuple, Union
from deepeval.models import DeepEvalBaseMLLM
from deepeval.metrics import BaseMultimodalMetric
from deepeval.test_case import (
    MLLMTestCaseParams,
    MLLMTestCase,
)
from deepeval.metrics.multimodal_metrics.multimodal_g_eval.template import (
    MultimodalGEvalTemplate,
)
from deepeval.metrics.multimodal_metrics.multimodal_g_eval.schema import *
from deepeval.utils import get_or_create_event_loop, prettify_list
from deepeval.metrics.indicator import metric_progress_indicator
from deepeval.metrics.utils import (
    initialize_multimodal_model,
    check_mllm_test_case_params,
    construct_verbose_logs,
    trimAndLoadJson,
)
from deepeval.metrics.multimodal_metrics.multimodal_g_eval.utils import (
    construct_test_case_list,
    no_multimodal_log_prob_support,
    construct_g_eval_params_string,
)
from deepeval.metrics.g_eval.utils import (
    Rubric,
    format_rubrics,
    calculate_weighted_summed_score,
    validate_and_sort_rubrics,
    validate_criteria_and_evaluation_steps,
    number_evaluation_steps,
    get_score_range,
)


class MultimodalGEval(BaseMultimodalMetric):
    def __init__(
        self,
        name: str,
        evaluation_params: List[MLLMTestCaseParams],
        criteria: Optional[str] = None,
        evaluation_steps: Optional[List[str]] = None,
        rubric: Optional[List[Rubric]] = None,
        model: Optional[Union[str, DeepEvalBaseMLLM]] = None,
        threshold: float = 0.5,
        top_logprobs: int = 20,
        async_mode: bool = True,
        strict_mode: bool = False,
        verbose_mode: bool = False,
        _include_g_eval_suffix: bool = True,
    ):
        validate_criteria_and_evaluation_steps(criteria, evaluation_steps)
        self.name = name
        self.evaluation_params = evaluation_params
        self.criteria = criteria
        self.rubric = validate_and_sort_rubrics(rubric)
        self.model, self.using_native_model = initialize_multimodal_model(model)
        self.evaluation_model = self.model.get_model_name()
        self.evaluation_steps = evaluation_steps
        self.threshold = 1 if strict_mode else threshold
        self.top_logprobs = top_logprobs
        self.strict_mode = strict_mode
        self.async_mode = async_mode
        self.verbose_mode = verbose_mode
        self._include_g_eval_suffix = _include_g_eval_suffix

    def measure(
        self,
        test_case: MLLMTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
        _additional_context: Optional[str] = None,
    ) -> float:

        check_mllm_test_case_params(test_case, self.evaluation_params, None, None, self)

        self.evaluation_cost = 0 if self.using_native_model else None
        with metric_progress_indicator(
            self, _show_indicator=_show_indicator, _in_component=_in_component
        ):
            if self.async_mode:
                loop = get_or_create_event_loop()
                loop.run_until_complete(
                    self.a_measure(
                        test_case,
                        _show_indicator=False,
                        _in_component=_in_component,
                        _additional_context=_additional_context,
                    )
                )
            else:
                self.evaluation_steps: List[str] = (
                    self._generate_evaluation_steps()
                )
                g_score, reason = self._evaluate(
                    test_case, _additional_context=_additional_context
                )
                self.reason = reason
                self.score = float(g_score) / 10
                self.score = (
                    0
                    if self.strict_mode and self.score < self.threshold
                    else self.score
                )
                self.success = self.score >= self.threshold
                self.verbose_logs = construct_verbose_logs(
                    self,
                    steps=[
                        f"Criteria:\n{self.criteria}",
                        f"Evaluation Steps:\n{prettify_list(self.evaluation_steps)}",
                        f"Rubric:\n{format_rubrics(self.rubric)}",
                        f"Score: {self.score}\nReason: {self.reason}",
                    ],
                )

            return self.score

    async def a_measure(
        self,
        test_case: MLLMTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
        _additional_context: Optional[str] = None,
    ) -> float:

        check_mllm_test_case_params(test_case, self.evaluation_params, None, None, self)

        self.evaluation_cost = 0 if self.using_native_model else None
        with metric_progress_indicator(
            self,
            async_mode=True,
            _show_indicator=_show_indicator,
            _in_component=_in_component,
        ):
            self.evaluation_steps: List[str] = (
                await self._a_generate_evaluation_steps()
            )
            g_score, reason = await self._a_evaluate(
                test_case, _additional_context=_additional_context
            )
            self.reason = reason
            self.score = (
                float(g_score) / 10 if not self.strict_mode else int(g_score)
            )
            self.success = self.score >= self.threshold
            self.verbose_logs = construct_verbose_logs(
                self,
                steps=[
                    f"Criteria:\n{self.criteria}",
                    f"Evaluation Steps:\n{prettify_list(self.evaluation_steps)}",
                    f"Rubric:\n{format_rubrics(self.rubric)}",
                    f"Score: {self.score}\nReason: {self.reason}",
                ],
            )
            return self.score

    async def _a_generate_evaluation_steps(self) -> List[str]:
        if self.evaluation_steps:
            return self.evaluation_steps

        g_eval_params_str = construct_g_eval_params_string(
            self.evaluation_params
        )
        prompt = MultimodalGEvalTemplate.generate_evaluation_steps(
            criteria=self.criteria, parameters=g_eval_params_str
        )
        if self.using_native_model:
            res, cost = await self.model.a_generate([prompt], schema=Steps)
            self.evaluation_cost += cost
            return res.steps
        else:
            try:
                res: Steps = await self.model.a_generate([prompt], schema=Steps)
                return res.steps
            except TypeError:
                res = await self.model.a_generate([prompt])
                data = trimAndLoadJson(res, self)
                return data["steps"]

    def _generate_evaluation_steps(self) -> List[str]:
        if self.evaluation_steps:
            return self.evaluation_steps

        g_eval_params_str = construct_g_eval_params_string(
            self.evaluation_params
        )
        prompt = MultimodalGEvalTemplate.generate_evaluation_steps(
            criteria=self.criteria, parameters=g_eval_params_str
        )
        if self.using_native_model:
            res, cost = self.model.generate([prompt], schema=Steps)
            self.evaluation_cost += cost
            return res.steps
        else:
            try:
                res: Steps = self.model.generate([prompt], schema=Steps)
                return res.steps
            except TypeError:
                res = self.model.generate([prompt])
                data = trimAndLoadJson(res, self)
                return data["steps"]

    async def _a_evaluate(
        self, test_case: MLLMTestCase, _additional_context: Optional[str] = None
    ) -> Tuple[Union[int, float], str]:
        test_case_list = construct_test_case_list(self.evaluation_params, test_case)
        g_eval_params_str = construct_g_eval_params_string(self.evaluation_params)

        if not self.strict_mode:
            rubric_str = format_rubrics(self.rubric) if self.rubric else None
            prompt = MultimodalGEvalTemplate.generate_evaluation_results(
                evaluation_steps=number_evaluation_steps(self.evaluation_steps),
                test_case_list=test_case_list,
                parameters=g_eval_params_str,
                rubric=rubric_str,
                score_range=get_score_range(self.rubric),
                _additional_context=_additional_context,
            )
        else:
            prompt = MultimodalGEvalTemplate.generate_strict_evaluation_results(
                evaluation_steps=number_evaluation_steps(self.evaluation_steps),
                test_case_list=test_case_list,
                parameters=g_eval_params_str,
                _additional_context=_additional_context,
            )
        try:
            # don't use log probabilities for unsupported gpt models
            if no_multimodal_log_prob_support(self.model):
                raise AttributeError("log_probs unsupported.")

            # Don't have to check for using native model
            # since generate raw response only exist for deepeval's native model
            res, cost = await self.model.a_generate_raw_response(
                prompt, top_logprobs=self.top_logprobs
            )
            self.evaluation_cost += cost
            data = trimAndLoadJson(res.choices[0].message.content, self)

            reason = data["reason"]
            score = data["score"]
            if self.strict_mode:
                return score, reason

            try:
                weighted_summed_score = calculate_weighted_summed_score(
                    score, res
                )
                return weighted_summed_score, reason
            except:
                return score, reason
        except AttributeError:
            # This catches the case where a_generate_raw_response doesn't exist.
            if self.using_native_model:
                res, cost = await self.model.a_generate(prompt)
                self.evaluation_cost += cost
                data = trimAndLoadJson(res, self)
                return data["score"], data["reason"]
            else:
                try:
                    res: ReasonScore = await self.model.a_generate(
                        prompt, schema=ReasonScore
                    )
                    return res.score, res.reason
                except TypeError:
                    res = await self.model.a_generate(prompt)
                    data = trimAndLoadJson(res, self)
                    return data["score"], data["reason"]

    def _evaluate(
        self, test_case: MLLMTestCase, _additional_context: Optional[str] = None
    ) -> Tuple[Union[int, float], str]:
        test_case_list = construct_test_case_list(self.evaluation_params, test_case)
        g_eval_params_str = construct_g_eval_params_string(self.evaluation_params)

        if not self.strict_mode:
            rubric_str = format_rubrics(self.rubric) if self.rubric else None
            prompt = MultimodalGEvalTemplate.generate_evaluation_results(
                evaluation_steps=number_evaluation_steps(self.evaluation_steps),
                test_case_list=test_case_list,
                parameters=g_eval_params_str,
                rubric=rubric_str,
                score_range=get_score_range(self.rubric),
                _additional_context=_additional_context,
            )
        else:
            prompt = MultimodalGEvalTemplate.generate_strict_evaluation_results(
                evaluation_steps=number_evaluation_steps(self.evaluation_steps),
                test_case_list=test_case_list,
                parameters=g_eval_params_str,
                _additional_context=_additional_context,
            )

        try:
            # don't use log probabilities for unsupported gpt models
            if no_multimodal_log_prob_support(self.model):
                raise AttributeError("log_probs unsupported.")

            res, cost = self.model.generate_raw_response(
                prompt, top_logprobs=self.top_logprobs
            )
            self.evaluation_cost += cost
            data = trimAndLoadJson(res.choices[0].message.content, self)

            reason = data["reason"]
            score = data["score"]
            if self.strict_mode:
                return score, reason

            try:
                weighted_summed_score = calculate_weighted_summed_score(
                    score, res
                )
                return weighted_summed_score, reason
            except:
                return score, reason
        except AttributeError:
            # This catches the case where generate_raw_response doesn't exist.
            if self.using_native_model:
                res, cost = self.model.generate(prompt)
                self.evaluation_cost += cost
                data = trimAndLoadJson(res, self)
                return data["score"], data["reason"]
            else:
                try:
                    res: ReasonScore = self.model.generate(
                        prompt, schema=ReasonScore
                    )
                    return res.score, res.reason
                except TypeError:
                    res = self.model.generate(prompt)
                    data = trimAndLoadJson(res, self)
                    return data["score"], data["reason"]

    def is_successful(self) -> bool:
        if self.error is not None:
            self.success = False
        else:
            try:
                self.success = self.score >= self.threshold
            except:
                self.success = False
        return self.success

    @property
    def __name__(self):
        if self._include_g_eval_suffix:
            return f"{self.name} (GEval)"
        else:
            return self.name
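
Usage sketch (assumptions: the judge model here is OpenAI-backed, so an OpenAI API key must be configured, and the image URL below is a hypothetical placeholder):

from deepeval.metrics import MultimodalGEval
from deepeval.test_case import MLLMTestCase, MLLMTestCaseParams, MLLMImage

# A minimal, criteria-only setup; evaluation steps are generated by the judge model.
metric = MultimodalGEval(
    name="Image Correctness",
    evaluation_params=[
        MLLMTestCaseParams.INPUT,
        MLLMTestCaseParams.ACTUAL_OUTPUT,
    ],
    criteria="Judge whether the actual output describes the image in the input accurately.",
    model="gpt-4o",
)

test_case = MLLMTestCase(
    input=[
        "Describe the landmark in this photo.",
        MLLMImage(url="https://example.com/landmark.jpg"),  # hypothetical URL
    ],
    actual_output=["The photo shows the Statue of Liberty viewed from behind."],
)

score = metric.measure(test_case)  # score in [0, 1] with the default 0-10 range
print(score, metric.reason)       # reason is stored on the metric after measure()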

deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py

Lines changed: 11 additions & 0 deletions

@@ -0,0 +1,11 @@
from typing import List
from pydantic import BaseModel


class ReasonScore(BaseModel):
    reason: str
    score: float


class Steps(BaseModel):
    steps: List[str]

deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py

Lines changed: 136 additions & 0 deletions

@@ -0,0 +1,136 @@
from typing import List, Optional, Tuple
import textwrap


class MultimodalGEvalTemplate:

    @staticmethod
    def generate_evaluation_steps(parameters: str, criteria: str):
        return textwrap.dedent(
            f"""Given an evaluation criteria which outlines how you should judge the {parameters}, generate 3-4 concise evaluation steps based on the criteria below. You MUST make it clear how to evaluate {parameters} in relation to one another.

            Evaluation Criteria:
            {criteria}

            **
            IMPORTANT: Please make sure to only return in JSON format, with the "steps" key as a list of strings. No words or explanation is needed.
            Example JSON:
            {{
                "steps": <list_of_strings>
            }}
            **

            JSON:
            """
        )

    @staticmethod
    def generate_evaluation_results(
        evaluation_steps: str,
        test_case_list: List,
        parameters: str,
        rubric: Optional[str] = None,
        score_range: Tuple[int, int] = (0, 10),
        _additional_context: Optional[str] = None,
    ):
        rubric_text = f"Rubric:\n{rubric}\n" if rubric else ""
        dependencies = (
            "evaluation steps and rubric" if rubric else "evaluation steps"
        )
        score_explanation = (
            "based on the rubric provided"
            if rubric
            else f"with {score_range[1]} indicating strong alignment with the evaluation steps and {score_range[0]} indicating no alignment"
        )
        reasoning_expectation = (
            "Be specific and grounded in the evaluation steps and rubric."
            if rubric
            else "Be specific and grounded in the evaluation steps."
        )
        additional_context = (
            f"\n\nAdditional Context:\n{_additional_context}\n"
            if _additional_context
            else ""
        )

        return (
            [
                textwrap.dedent(
                    f"""You are an evaluator. Given the following {dependencies}, assess the response below and return a JSON object with two fields:

                    - `"score"`: an integer between {score_range[0]} and {score_range[1]}, {score_explanation}.
                    - `"reason"`: a brief explanation for why the score was given. This must mention specific strengths or shortcomings, referencing relevant details from the input. Do **not** quote the score itself in the explanation.

                    Your explanation should:
                    - {reasoning_expectation}
                    - Mention key details from the test case parameters.
                    - Be concise, clear, and focused on the evaluation logic.

                    Only return valid JSON. Do **not** include any extra commentary or text.

                    ---

                    Evaluation Steps:
                    {evaluation_steps}

                    {rubric_text}
                    Test Case:
                    ************************
                    """
                )
            ]
            + test_case_list
            + [
                textwrap.dedent(
                    f"""
                    ************************
                    \n\n\n
                    Parameters:
                    {parameters}
                    {additional_context}

                    ---
                    **Example JSON:**
                    {{
                        "score": {score_range[0]},
                        "reason": "your concise and informative reason here"
                    }}

                    JSON:
                    """
                )
            ]
        )

    @staticmethod
    def generate_strict_evaluation_results(
        evaluation_steps: str,
        test_case_list: List,
        parameters: str,
        _additional_context: Optional[str] = None,
    ):
        additional_context = (
            f"\n\nAdditional Context:\n{_additional_context}\n"
            if _additional_context
            else ""
        )
        return (
            [
                textwrap.dedent(
                    f"""Given the evaluation steps, return a JSON with two keys: 1) a `score` key that is STRICTLY EITHER 1 (follows the criteria 100% outlined in the evaluation steps), OR 0 (does not follow the criteria), and 2) a `reason` key, a reason for the given score, but DO NOT QUOTE THE SCORE in your reason. Please mention specific information from {parameters} in your reason, but be very concise with it!

                    Evaluation Steps:
                    {evaluation_steps}
                    ************************
                    """
                )
            ]
            + test_case_list
            + [
                textwrap.dedent(
                    f"""
                    ************************
                    {additional_context}
                    **
                    IMPORTANT: Please make sure to only return in JSON format, with the "score" and "reason" key. No words or explanation is needed.

                    Example JSON:
                    {{
                        "score": 0,
                        "reason": "The text does not follow the evaluation steps provided."
                    }}
                    **

                    JSON:
                    """
                )
            ]
        )
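
For a rough sense of what the step-generation template renders to (the parameters and criteria below are made up for illustration), it produces a single dedented instruction block asking the judge for 3-4 JSON-formatted steps:

from deepeval.metrics.multimodal_metrics.multimodal_g_eval.template import (
    MultimodalGEvalTemplate,
)

# Hypothetical inputs, purely for illustration.
prompt = MultimodalGEvalTemplate.generate_evaluation_steps(
    parameters="Input and Actual Output",
    criteria="The actual output should describe every image in the input accurately.",
)
print(prompt)  # the instruction text passed (inside a list) to the judge model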

deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py

Lines changed: 63 additions & 0 deletions

@@ -0,0 +1,63 @@
from deepeval.test_case import MLLMTestCaseParams, MLLMTestCase, ToolCall
from deepeval.test_case.mllm_test_case import MLLMImage
from deepeval.metrics.g_eval.utils import (
    unsupported_log_probs_gpt_models,
)
from deepeval.models import (
    DeepEvalBaseMLLM,
    MultimodalOpenAIModel,
)

from typing import List, Union


G_EVAL_PARAMS = {
    MLLMTestCaseParams.INPUT: "Input",
    MLLMTestCaseParams.ACTUAL_OUTPUT: "Actual Output",
    MLLMTestCaseParams.EXPECTED_OUTPUT: "Expected Output",
    MLLMTestCaseParams.CONTEXT: "Context",
    MLLMTestCaseParams.RETRIEVAL_CONTEXT: "Retrieval Context",
    MLLMTestCaseParams.EXPECTED_TOOLS: "Expected Tools",
    MLLMTestCaseParams.TOOLS_CALLED: "Tools Called",
}


def construct_g_eval_params_string(
    mllm_test_case_params: List[MLLMTestCaseParams],
):
    g_eval_params = [G_EVAL_PARAMS[param] for param in mllm_test_case_params]
    if len(g_eval_params) == 1:
        g_eval_params_str = g_eval_params[0]
    elif len(g_eval_params) == 2:
        g_eval_params_str = " and ".join(g_eval_params)
    else:
        g_eval_params_str = (
            ", ".join(g_eval_params[:-1]) + ", and " + g_eval_params[-1]
        )

    return g_eval_params_str


def construct_test_case_list(
    evaluation_params: List[MLLMTestCaseParams], test_case: MLLMTestCase
) -> List[Union[str, MLLMImage]]:
    test_case_list = []
    for param in evaluation_params:
        test_case_param_list = [f"\n\n\n{G_EVAL_PARAMS[param]}:\n"]
        value = getattr(test_case, param.value)
        for v in value:
            if isinstance(v, ToolCall):
                test_case_param_list.append(repr(v))
            else:
                test_case_param_list.append(v)
        test_case_list.extend(test_case_param_list)
    return test_case_list


def no_multimodal_log_prob_support(model: Union[str, DeepEvalBaseMLLM]):
    if isinstance(model, str) and model in unsupported_log_probs_gpt_models:
        return True
    elif (
        isinstance(model, MultimodalOpenAIModel)
        and model.model_name in unsupported_log_probs_gpt_models
    ):
        return True
    return False
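
The helpers above are pure functions, so their behavior can be checked directly; for example, construct_g_eval_params_string joins the chosen parameters into a readable phrase (the parameter combinations below are chosen only for illustration):

from deepeval.test_case import MLLMTestCaseParams
from deepeval.metrics.multimodal_metrics.multimodal_g_eval.utils import (
    construct_g_eval_params_string,
)

print(construct_g_eval_params_string([MLLMTestCaseParams.INPUT]))
# Input
print(construct_g_eval_params_string(
    [MLLMTestCaseParams.INPUT, MLLMTestCaseParams.ACTUAL_OUTPUT]
))
# Input and Actual Output
print(construct_g_eval_params_string([
    MLLMTestCaseParams.INPUT,
    MLLMTestCaseParams.ACTUAL_OUTPUT,
    MLLMTestCaseParams.RETRIEVAL_CONTEXT,
]))
# Input, Actual Output, and Retrieval Context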

‎deepeval/models/llms/openai_model.py

Lines changed: 1 addition & 1 deletion

@@ -179,7 +179,7 @@ def __init__(
         super().__init__(model_name)

     ###############################################
-    # Other generate functions
+    # Generate functions
     ###############################################

     @retry(

‎deepeval/models/mlllms/openai_model.py

Lines changed: 69 additions & 15 deletions

@@ -1,5 +1,6 @@
 from typing import Optional, Tuple, List, Union
 from openai import OpenAI, AsyncOpenAI
+from openai.types.chat import ParsedChatCompletion
 from pydantic import BaseModel
 from io import BytesIO
 import logging
@@ -63,15 +64,6 @@ def log_retry_error(retry_state: RetryCallState):
     "gpt-4o-2024-11-20",
     "gpt-4o-mini",
     "gpt-4o-mini-2024-07-18",
-    "gpt-4",
-    "gpt-4-0125-preview",
-    "gpt-4-0613",
-    "gpt-4-1106-preview",
-    "gpt-4-turbo",
-    "gpt-4-turbo-2024-04-09",
-    "gpt-4-turbo-preview",
-    "gpt-4-32k",
-    "gpt-4-32k-0613",
 ]

 default_multimodal_gpt_model = "gpt-4o"
@@ -101,6 +93,10 @@ def __init__(

         super().__init__(model_name, *args, **kwargs)

+    ###############################################
+    # Generate functions
+    ###############################################
+
     @retry(
         wait=wait_exponential_jitter(initial=1, exp_base=2, jitter=2, max=10),
         retry=retry_if_exception_type(retryable_exceptions),
@@ -113,9 +109,10 @@ def generate(
     ) -> Tuple[str, float]:
         client = OpenAI(api_key=self._openai_api_key)
         prompt = self.generate_prompt(multimodal_input)
+        messages = [{"role": "user", "content": prompt}]
         response = client.beta.chat.completions.parse(
             model=self.model_name,
-            messages=[{"role": "user", "content": prompt}],
+            messages=messages,
             response_format=schema,
         )
         input_tokens = response.usage.prompt_tokens
@@ -136,9 +133,10 @@ async def a_generate(
     ) -> Tuple[str, float]:
         client = AsyncOpenAI(api_key=self._openai_api_key)
         prompt = self.generate_prompt(multimodal_input)
+        messages = [{"role": "user", "content": prompt}]
         response = await client.beta.chat.completions.parse(
             model=self.model_name,
-            messages=[{"role": "user", "content": prompt}],
+            messages=messages,
             response_format=schema,
         )
         input_tokens = response.usage.prompt_tokens
@@ -147,6 +145,64 @@ async def a_generate(
         generated_text = response.choices[0].message.parsed
         return generated_text, total_cost

+    ###############################################
+    # Other generate functions
+    ###############################################
+
+    @retry(
+        wait=wait_exponential_jitter(initial=1, exp_base=2, jitter=2, max=10),
+        retry=retry_if_exception_type(retryable_exceptions),
+        after=log_retry_error,
+    )
+    def generate_raw_response(
+        self,
+        multimodal_input: List[Union[str, MLLMImage]],
+        top_logprobs: int = 5,
+    ) -> Tuple[ParsedChatCompletion, float]:
+        client = OpenAI(api_key=self._openai_api_key)
+        prompt = self.generate_prompt(multimodal_input)
+        messages = [{"role": "user", "content": prompt}]
+        completion = client.chat.completions.create(
+            model=self.model_name,
+            messages=messages,
+            logprobs=True,
+            top_logprobs=top_logprobs,
+        )
+        # Cost calculation
+        input_tokens = completion.usage.prompt_tokens
+        output_tokens = completion.usage.completion_tokens
+        cost = self.calculate_cost(input_tokens, output_tokens)
+        return completion, cost
+
+    @retry(
+        wait=wait_exponential_jitter(initial=1, exp_base=2, jitter=2, max=10),
+        retry=retry_if_exception_type(retryable_exceptions),
+        after=log_retry_error,
+    )
+    async def a_generate_raw_response(
+        self,
+        multimodal_input: List[Union[str, MLLMImage]],
+        top_logprobs: int = 5,
+    ) -> Tuple[ParsedChatCompletion, float]:
+        client = AsyncOpenAI(api_key=self._openai_api_key)
+        prompt = self.generate_prompt(multimodal_input)
+        messages = [{"role": "user", "content": prompt}]
+        completion = await client.chat.completions.create(
+            model=self.model_name,
+            messages=messages,
+            logprobs=True,
+            top_logprobs=top_logprobs,
+        )
+        # Cost calculation
+        input_tokens = completion.usage.prompt_tokens
+        output_tokens = completion.usage.completion_tokens
+        cost = self.calculate_cost(input_tokens, output_tokens)
+        return completion, cost
+
+    ###############################################
+    # Utilities
+    ###############################################
+
     def generate_prompt(
         self, multimodal_input: List[Union[str, MLLMImage]] = []
     ):
@@ -173,10 +229,6 @@ def generate_prompt(
             prompt.append(visual_dict)
         return prompt

-    ###############################################
-    # Utilities
-    ###############################################
-
     def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
         pricing = model_pricing.get(
             self.model_name, model_pricing["gpt-4o"]
@@ -187,6 +239,8 @@ def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:

     def encode_pil_image(self, pil_image):
         image_buffer = BytesIO()
+        if pil_image.mode in ("RGBA", "LA", "P"):
+            pil_image = pil_image.convert("RGB")
         pil_image.save(image_buffer, format="JPEG")
         image_bytes = image_buffer.getvalue()
         base64_encoded_image = base64.b64encode(image_bytes).decode("utf-8")
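
The encode_pil_image change matters because Pillow's JPEG encoder rejects images with an alpha channel; a small standalone sketch of the failure mode it guards against (assumes only that Pillow is installed):

from io import BytesIO
from PIL import Image

rgba = Image.new("RGBA", (8, 8), (255, 0, 0, 128))  # image with an alpha channel
buffer = BytesIO()
# rgba.save(buffer, format="JPEG")  # would raise OSError: cannot write mode RGBA as JPEG
rgba.convert("RGB").save(buffer, format="JPEG")  # convert first, as the commit now does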

‎tests/test_image_metrics.py

Lines changed: 23 additions & 1 deletion

@@ -3,7 +3,7 @@
 from deepeval.dataset import EvaluationDataset
 from deepeval import evaluate
 from deepeval.evaluate.configs import AsyncConfig, DisplayConfig
-from deepeval.test_case import MLLMTestCase, LLMTestCase, MLLMImage
+from deepeval.test_case import MLLMTestCase, LLMTestCase, MLLMImage, MLLMTestCaseParams, ToolCall
 from deepeval.metrics import (
     AnswerRelevancyMetric,
     ImageEditingMetric,
@@ -13,6 +13,7 @@
     MultimodalContextualPrecisionMetric,
     MultimodalAnswerRelevancyMetric,
     MultimodalFaithfulnessMetric,
+    MultimodalGEval
 )

 image_path = "./data/image_1.jpg"
@@ -93,6 +94,8 @@ def multimodal_rag_case():
             url="https://upload.wikimedia.org/wikipedia/commons/thumb/0/03/Liberty-from-behind-2024.jpg/330px-Liberty-from-behind-2024.jpg"
         ),
     ],
+    tools_called=[ToolCall(name="google search")],
+    expected_tools=[ToolCall(name="google search")],
 )


@@ -128,6 +131,25 @@ def test_multimodal_rag_case(multimodal_rag_case):
             MultimodalContextualPrecisionMetric(),
             MultimodalAnswerRelevancyMetric(),
             MultimodalFaithfulnessMetric(),
+            MultimodalGEval(
+                name="MultimodalGEval",
+                evaluation_params=[
+                    MLLMTestCaseParams.INPUT,
+                    MLLMTestCaseParams.ACTUAL_OUTPUT,
+                    MLLMTestCaseParams.EXPECTED_OUTPUT,
+                    MLLMTestCaseParams.CONTEXT,
+                    MLLMTestCaseParams.RETRIEVAL_CONTEXT,
+                    MLLMTestCaseParams.TOOLS_CALLED,
+                    MLLMTestCaseParams.EXPECTED_TOOLS,
+                ],
+                model="gpt-4o",
+                evaluation_steps=[
+                    "Determine if the output image follows the input instructions clearly.",
+                    "Determine if the expected output aligns with the actual output.",
+                    "Determine if the context is aligned with the retrieval context.",
+                    "Determine if the tools called are aligned with the expected tools.",
+                ],
+            ),
         ],
         display_config=DisplayConfig(verbose_mode=True),
         async_config=AsyncConfig(run_async=False),