
Commit c9be01b

1 parent 4f3f1ee commit c9be01b

18 files changed: +63 -86 lines changed

deepeval/benchmarks/big_bench_hard/big_bench_hard.py

Lines changed: 1 addition & 1 deletion

@@ -171,7 +171,7 @@ def load_benchmark_dataset(self, task: BigBenchHardTask) -> List[Golden]:
 
         goldens: List[Golden] = []
         for data in dataset["test"]:
-            golden = Golden(input=data["input"], expectedOutput=data["target"])
+            golden = Golden(input=data["input"], expected_output=data["target"])
            goldens.append(golden)
 
        return goldens
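
Note: the keyword argument switches from the camelCase alias ("expectedOutput") to the Pydantic field name itself ("expected_output"). A minimal sketch of the pattern, assuming a Pydantic v2 model configured to populate by field name (the real Golden lives in deepeval.dataset and may be defined differently):

from typing import Optional
from pydantic import BaseModel, ConfigDict, Field

class GoldenSketch(BaseModel):
    # Hypothetical stand-in for deepeval's Golden, for illustration only.
    model_config = ConfigDict(populate_by_name=True)
    input: str
    expected_output: Optional[str] = Field(None, alias="expectedOutput")

golden = GoldenSketch(input="1 + 1 = ?", expected_output="2")  # construct by field name
print(golden.model_dump(by_alias=True))  # {'input': '1 + 1 = ?', 'expectedOutput': '2'}

The same rename applies to every benchmark loader below.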

deepeval/benchmarks/drop/drop.py

Lines changed: 1 addition & 1 deletion

@@ -181,7 +181,7 @@ def load_benchmark_dataset(self, task: DROPTask) -> List[Golden]:
             output = DELIMITER.join(tuple(data["answers_spans"]["spans"][0]))
             output_type = data["answers_spans"]["types"][0]
             golden = Golden(
-                input=input, expectedOutput=output, context=[output_type]
+                input=input, expected_output=output, context=[output_type]
             )
             goldens.append(golden)

deepeval/benchmarks/gsm8k/gsm8k.py

Lines changed: 1 addition & 1 deletion

@@ -98,7 +98,7 @@ def load_benchmark_dataset(self) -> List[Golden]:
         for data in dataset["test"]:
             input = data["question"]
             output = GSM8KTemplate.format_answer(data)
-            golden = Golden(input=input, expectedOutput=output)
+            golden = Golden(input=input, expected_output=output)
             goldens.append(golden)
 
         return goldens

deepeval/benchmarks/hellaswag/hellaswag.py

Lines changed: 1 addition & 1 deletion

@@ -192,7 +192,7 @@ def load_benchmark_dataset(self, task: HellaSwagTask) -> List[Golden]:
                 data, include_answer=False
             )
             golden = Golden(
-                input=input, expectedOutput=choices[int(data["label"])]
+                input=input, expected_output=choices[int(data["label"])]
             )
             goldens.append(golden)
         return goldens

deepeval/benchmarks/human_eval/human_eval.py

Lines changed: 1 addition & 1 deletion

@@ -118,6 +118,6 @@ def load_benchmark_dataset(self, task: HumanEvalTask) -> List[Golden]:
         )[0]
         # Construct test set
         golden = Golden(
-            input=test_set["prompt"], expectedOutput=test_set["test"]
+            input=test_set["prompt"], expected_output=test_set["test"]
         )
         return golden

deepeval/benchmarks/mmlu/mmlu.py

Lines changed: 1 addition & 1 deletion

@@ -179,6 +179,6 @@ def load_benchmark_dataset(self, task: MMLUTask) -> List[Golden]:
         goldens: List[Golden] = []
         for data in dataset["test"]:
             input = MMLUTemplate.format_question(data, include_answer=False)
-            golden = Golden(input=input, expectedOutput=data["target"])
+            golden = Golden(input=input, expected_output=data["target"])
             goldens.append(golden)
         return goldens

deepeval/benchmarks/truthful_qa/truthful_qa.py

Lines changed: 2 additions & 2 deletions

@@ -195,14 +195,14 @@ def load_benchmark_dataset(
                 input, expected_output = TruthfulQATemplate.format_mc1_question(
                     data
                 )
-                golden = Golden(input=input, expectedOutput=expected_output)
+                golden = Golden(input=input, expected_output=expected_output)
                 goldens.append(golden)
             elif mode == TruthfulQAMode.MC2:
                 input, expected_output = TruthfulQATemplate.format_mc2_question(
                     data
                 )
                 golden = Golden(
-                    input=input, expectedOutput=str(expected_output)
+                    input=input, expected_output=str(expected_output)
                 )
                 goldens.append(golden)

deepeval/dataset/api.py

Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@ class APIDataset(BaseModel):
     overwrite: bool
     goldens: Optional[List[Golden]] = Field(default=[])
     conversational_goldens: Optional[List[ConversationalGolden]] = Field(
-        default=[], serialization_alias="conversationalGoldens"
+        default=[], alias="conversationalGoldens"
     )
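
For context, in Pydantic v2 serialization_alias only affects dumps, while alias also acts as the validation alias, so the camelCase wire format is preserved either way. A standalone sketch of the difference (not deepeval code; populate_by_name is an assumption that keeps snake_case keyword arguments working):

from typing import List
from pydantic import BaseModel, ConfigDict, Field

class DumpOnly(BaseModel):
    # serialization_alias: camelCase appears only when dumping
    conversational_goldens: List[str] = Field(
        default=[], serialization_alias="conversationalGoldens"
    )

class DumpAndValidate(BaseModel):
    # alias: camelCase is used when dumping and accepted when validating
    model_config = ConfigDict(populate_by_name=True)
    conversational_goldens: List[str] = Field(
        default=[], alias="conversationalGoldens"
    )

print(DumpOnly().model_dump(by_alias=True))         # {'conversationalGoldens': []}
print(DumpAndValidate().model_dump(by_alias=True))  # {'conversationalGoldens': []}
print(DumpAndValidate.model_validate({"conversationalGoldens": ["hi"]}))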

deepeval/dataset/utils.py

Lines changed: 3 additions & 3 deletions

@@ -10,10 +10,10 @@ def convert_test_cases_to_goldens(
     for test_case in test_cases:
         golden = {
             "input": test_case.input,
-            "actualOutput": test_case.actual_output,
-            "expectedOutput": test_case.expected_output,
+            "actual_output": test_case.actual_output,
+            "expected_output": test_case.expected_output,
             "context": test_case.context,
-            "retrievalContext": test_case.retrieval_context,
+            "retrieval_context": test_case.retrieval_context,
         }
         goldens.append(Golden(**golden))
     return goldens
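
Because the dict is unpacked into keyword arguments via Golden(**golden), its keys now have to be the snake_case field names rather than the camelCase aliases. A compact illustration with a hypothetical two-field stand-in:

from typing import Optional
from pydantic import BaseModel, ConfigDict, Field

class GoldenSketch(BaseModel):
    # Hypothetical stand-in for deepeval's Golden, for illustration only.
    model_config = ConfigDict(populate_by_name=True)
    input: str
    actual_output: Optional[str] = Field(None, alias="actualOutput")

golden_dict = {"input": "What is DeepEval?", "actual_output": "An LLM evaluation framework."}
golden = GoldenSketch(**golden_dict)  # keys match the field names, so unpacking works
print(golden.actual_output)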

deepeval/evaluate.py

Lines changed: 3 additions & 3 deletions

@@ -114,12 +114,12 @@ def create_api_test_case(
             # to each individual message (test case)
             test_case.additional_metadata = additional_metadata
             test_case.comments = comments
-            traceStack = None
+            trace_stack = None
         else:
             success = True
             name = os.getenv(PYTEST_RUN_TEST_NAME, f"test_case_{index}")
             order = test_case._dataset_rank
-            traceStack = get_trace_stack()
+            trace_stack = get_trace_stack()
 
         return LLMApiTestCase(
             name=name,

@@ -135,7 +135,7 @@ def create_api_test_case(
             order=order,
             additionalMetadata=test_case.additional_metadata,
             comments=test_case.comments,
-            traceStack=traceStack,
+            traceStack=trace_stack,
         )
 
     elif isinstance(test_case, ConversationalTestCase):

deepeval/event/api.py

Lines changed: 10 additions & 16 deletions

@@ -22,28 +22,22 @@ class Config:
 
 
 class APIEvent(BaseModel):
-    name: str = Field(..., serialization_alias="name")
+    name: str = Field(..., alias="name")
     input: str
     response: str
     retrieval_context: Optional[List[str]] = Field(
-        None, serialization_alias="retrievalContext"
-    )
-    completion_time: Optional[float] = Field(
-        None, serialization_alias="completionTime"
-    )
-    token_usage: Optional[float] = Field(None, serialization_alias="tokenUsage")
-    token_cost: Optional[float] = Field(None, serialization_alias="tokenCost")
-    distinct_id: Optional[str] = Field(None, serialization_alias="distinctId")
-    conversation_id: Optional[str] = Field(
-        None, serialization_alias="conversationId"
+        None, alias="retrievalContext"
     )
+    completion_time: Optional[float] = Field(None, alias="completionTime")
+    token_usage: Optional[float] = Field(None, alias="tokenUsage")
+    token_cost: Optional[float] = Field(None, alias="tokenCost")
+    distinct_id: Optional[str] = Field(None, alias="distinctId")
+    conversation_id: Optional[str] = Field(None, alias="conversationId")
     custom_properties: Optional[Dict[str, CustomProperty]] = Field(
-        None, serialization_alias="customProperties"
-    )
-    trace_stack: Optional[Dict] = Field(None, serialization_alias="traceStack")
-    trace_provider: Optional[str] = Field(
-        None, serialization_alias="traceProvider"
+        None, alias="customProperties"
     )
+    trace_stack: Optional[Dict] = Field(None, alias="traceStack")
+    trace_provider: Optional[str] = Field(None, alias="traceProvider")
     hyperparameters: Optional[Dict] = Field(None)
 
     class Config:
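
Since every field keeps its camelCase alias, the JSON payload sent to the API should be unchanged; only the Python-side definitions get shorter. A trimmed-down, hypothetical APIEvent-like model showing the round trip, assuming the payload is produced with by_alias=True (or an equivalent setting):

from typing import Optional
from pydantic import BaseModel, ConfigDict, Field

class MiniEvent(BaseModel):
    # Cut-down, hypothetical stand-in for APIEvent.
    model_config = ConfigDict(populate_by_name=True)
    name: str
    distinct_id: Optional[str] = Field(None, alias="distinctId")
    completion_time: Optional[float] = Field(None, alias="completionTime")

event = MiniEvent(name="chatbot", distinct_id="user-1", completion_time=0.8)
print(event.model_dump_json(by_alias=True))
# {"name":"chatbot","distinctId":"user-1","completionTime":0.8}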

deepeval/event/feedback.py

Lines changed: 2 additions & 2 deletions

@@ -21,10 +21,10 @@ def send_feedback(
         raise ValueError("'rating' must be between 1 and 5, inclusive.")
 
     api_event = APIFeedback(
-        eventId=event_id,
+        event_id=event_id,
         provider=provider,
         rating=rating,
-        expectedResponse=expected_response,
+        expected_response=expected_response,
        explanation=explanation,
    )
    api = Api()

deepeval/integrations/hugging_face/tests/test_callbacks.py

Lines changed: 1 addition & 2 deletions

@@ -76,14 +76,13 @@ def create_deepeval_dataset(dataset, sample_size):
     random_index_list = [
         random.randint(0, total_length) for _ in range(sample_size)
     ]
-    print(random_index_list)
     eval_dataset = [dataset[row] for row in random_index_list]
     goldens = []
     for row in eval_dataset:
         context = ["; ".join(row["context"]["contexts"])]
         golden = Golden(
             input=row["question"],
-            expectedOutput=row["long_answer"],
+            expected_output=row["long_answer"],
             context=context,
             retrieval_context=context,
         )

deepeval/integrations/llama_index/callback.py

Lines changed: 7 additions & 5 deletions

@@ -175,10 +175,12 @@ def create_trace_instance(
                 traces=[],
                 llmMetadata=LlmMetadata(
                     model=processed_payload["llm_model_name"],
-                    outputMessages=None,
-                    tokenCount=None,
-                    promptTemplate=processed_payload.get("llm_prompt_template"),
-                    promptTemplateVariables=processed_payload.get(
+                    output_messages=None,
+                    token_count=None,
+                    prompt_template=processed_payload.get(
+                        "llm_prompt_template"
+                    ),
+                    prompt_template_variables=processed_payload.get(
                         "llm_prompt_template_variables"
                     ),
                 ),

@@ -224,7 +226,7 @@ def create_trace_instance(
                 traces=[],
                 rerankingMetadata=RerankingMetadata(
                     model=processed_payload["reranker_model_name"],
-                    topK=processed_payload["reranker_top_k"],
+                    top_k=processed_payload["reranker_top_k"],
                 ),
             )

deepeval/synthesizer/synthesizer.py

Lines changed: 1 addition & 1 deletion

@@ -202,7 +202,7 @@ def _generate_from_contexts(
                 source_files[index] if source_files is not None else None
             )
             golden = Golden(
-                input=evolved_input, context=context, sourceFile=source_file
+                input=evolved_input, context=context, source_file=source_file
             )
 
             if include_expected_output:

deepeval/test_run/api.py

Lines changed: 14 additions & 28 deletions

@@ -8,45 +8,33 @@ class MetricMetadata(BaseModel):
     success: bool
     score: Optional[float] = None
     reason: Optional[str] = None
-    strict_mode: Optional[bool] = Field(False, serialization_alias="strictMode")
-    evaluation_model: Optional[str] = Field(
-        None, serialization_alias="evaluationModel"
-    )
+    strict_mode: Optional[bool] = Field(False, alias="strictMode")
+    evaluation_model: Optional[str] = Field(None, alias="evaluationModel")
     error: Optional[str] = None
-    evaluation_cost: Union[float, None] = Field(
-        None, serialization_alias="evaluationCost"
-    )
+    evaluation_cost: Union[float, None] = Field(None, alias="evaluationCost")
 
 
 class LLMApiTestCase(BaseModel):
     name: str
     input: str
-    actual_output: str = Field(..., serialization_alias="actualOutput")
-    expected_output: Optional[str] = Field(
-        None, serialization_alias="expectedOutput"
-    )
+    actual_output: str = Field(..., alias="actualOutput")
+    expected_output: Optional[str] = Field(None, alias="expectedOutput")
     context: Optional[list] = Field(None)
-    retrieval_context: Optional[list] = Field(
-        None, serialization_alias="retrievalContext"
-    )
+    retrieval_context: Optional[list] = Field(None, alias="retrievalContext")
     # make optional, not all test cases in a conversation will be evaluated
     success: Union[bool, None] = Field(None)
     # make optional, not all test cases in a conversation will be evaluated
     metrics_metadata: Union[List[MetricMetadata], None] = Field(
-        None, serialization_alias="metricsMetadata"
+        None, alias="metricsMetadata"
     )
     # make optional, not all test cases in a conversation will be evaluated
-    run_duration: Union[float, None] = Field(
-        None, serialization_alias="runDuration"
-    )
+    run_duration: Union[float, None] = Field(None, alias="runDuration")
     # make optional, not all test cases in a conversation will be evaluated
-    evaluation_cost: Union[float, None] = Field(
-        None, serialization_alias="evaluationCost"
-    )
+    evaluation_cost: Union[float, None] = Field(None, alias="evaluationCost")
     order: Union[int, None] = Field(None)
     # These should map 1 to 1 from golden
     additional_metadata: Optional[Dict] = Field(
-        None, serialization_alias="additionalMetadata"
+        None, alias="additionalMetadata"
     )
     comments: Optional[str] = Field(None)
     traceStack: Optional[dict] = Field(None)

@@ -80,14 +68,12 @@ class ConversationalApiTestCase(BaseModel):
     success: bool
     # metrics_metadata can be None when we're not evaluating using conversational metrics
     metrics_metadata: Union[List[MetricMetadata], None] = Field(
-        None, serialization_alias="metricsMetadata"
-    )
-    run_duration: float = Field(0.0, serialization_alias="runDuration")
-    evaluation_cost: Union[float, None] = Field(
-        None, serialization_alias="evaluationCost"
+        None, alias="metricsMetadata"
     )
+    run_duration: float = Field(0.0, alias="runDuration")
+    evaluation_cost: Union[float, None] = Field(None, alias="evaluationCost")
     messages: List[LLMApiTestCase] = Field(
-        default_factory=lambda: [], serialization_alias="testCases"
+        default_factory=lambda: [], alias="testCases"
     )
     order: Union[int, None] = Field(None)

deepeval/test_run/test_run.py

Lines changed: 10 additions & 14 deletions

@@ -91,7 +91,7 @@ class TestRun(BaseModel):
     )
     deployment: Optional[bool] = Field(True)
     deployment_configs: Optional[DeploymentConfigs] = Field(
-        None, serialization_alias="deploymentConfigs"
+        None, alias="deploymentConfigs"
     )
     test_cases: List[LLMApiTestCase] = Field(
         alias="testCases", default_factory=lambda: []

@@ -100,19 +100,15 @@ class TestRun(BaseModel):
         alias="conversationalTestCases", default_factory=lambda: []
     )
     metrics_scores: List[MetricScores] = Field(
-        default_factory=lambda: [], serialization_alias="metricsScores"
+        default_factory=lambda: [], alias="metricsScores"
     )
     hyperparameters: Optional[Dict[Any, Any]] = Field(None)
-    test_passed: Optional[int] = Field(None, serialization_alias="testPassed")
-    test_failed: Optional[int] = Field(None, serialization_alias="testFailed")
-    run_duration: float = Field(0.0, serialization_alias="runDuration")
-    evaluation_cost: Union[float, None] = Field(
-        None, serialization_alias="evaluationCost"
-    )
-    dataset_alias: Optional[str] = Field(
-        None, serialization_alias="datasetAlias"
-    )
-    dataset_id: Optional[str] = Field(None, serialization_alias="datasetId")
+    test_passed: Optional[int] = Field(None, alias="testPassed")
+    test_failed: Optional[int] = Field(None, alias="testFailed")
+    run_duration: float = Field(0.0, alias="runDuration")
+    evaluation_cost: Union[float, None] = Field(None, alias="evaluationCost")
+    dataset_alias: Optional[str] = Field(None, alias="datasetAlias")
+    dataset_id: Optional[str] = Field(None, alias="datasetId")
 
     def add_test_case(
         self, api_test_case: Union[LLMApiTestCase, ConversationalApiTestCase]

@@ -557,8 +553,8 @@ def post_test_run(self, test_run: TestRun):
 
         remaining_test_run = RemainingTestRun(
             testRunId=response.testRunId,
-            testCases=test_case_batch,
-            conversationalTestCases=conversational_batch,
+            test_cases=test_case_batch,
+            conversational_test_cases=conversational_batch,
         )
 
         body = None
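
This call site can mix spellings (testRunId= stays camelCase while the batches switch to snake_case) because, with alias plus population by field name, Pydantic accepts either form at construction time. populate_by_name is an assumption here; the real RemainingTestRun may be configured differently. A small sketch:

from typing import List
from pydantic import BaseModel, ConfigDict, Field

class BatchSketch(BaseModel):
    # Hypothetical model mirroring the alias pattern used above.
    model_config = ConfigDict(populate_by_name=True)
    test_run_id: str = Field(alias="testRunId")
    test_cases: List[dict] = Field(default_factory=list, alias="testCases")

a = BatchSketch(testRunId="run-1", testCases=[])      # alias spelling
b = BatchSketch(test_run_id="run-1", test_cases=[])   # field-name spelling
assert a == b  # both spellings are accepted and produce the same model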

tests/test_everything.py

Lines changed: 3 additions & 3 deletions

@@ -80,7 +80,7 @@
 """
 
 strict_mode = False
-verbose_mode = True
+verbose_mode = False
 
 
 @pytest.mark.skip(reason="openai is expensive")

@@ -89,7 +89,7 @@ def test_everything():
         threshold=0.1,
         strict_mode=strict_mode,
         async_mode=False,
-        verbose_mode=True,
+        verbose_mode=verbose_mode,
     )
     metric2 = FaithfulnessMetric(
         threshold=0.5, strict_mode=strict_mode, verbose_mode=verbose_mode

@@ -167,7 +167,7 @@ def test_everything():
     )
 
 
-@pytest.mark.skip(reason="openadi is expensive")
+# @pytest.mark.skip(reason="openadi is expensive")
 def test_everything_2():
     metric1 = AnswerRelevancyMetric(threshold=0.5, strict_mode=strict_mode)
     metric2 = FaithfulnessMetric(threshold=0.5, strict_mode=strict_mode)
