Commit ea7caee

Merge pull request #845 from confident-ai/hotfix/alias
Fixed alias
2 parents 9920598 + 9b56dda commit ea7caee

20 files changed: +98 -44 lines changed


deepeval/benchmarks/big_bench_hard/big_bench_hard.py

Lines changed: 1 addition & 1 deletion
@@ -171,7 +171,7 @@ def load_benchmark_dataset(self, task: BigBenchHardTask) -> List[Golden]:

         goldens: List[Golden] = []
         for data in dataset["test"]:
-            golden = Golden(input=data["input"], expectedOutput=data["target"])
+            golden = Golden(input=data["input"], expected_output=data["target"])
             goldens.append(golden)

         return goldens

deepeval/benchmarks/drop/drop.py

Lines changed: 1 addition & 1 deletion
@@ -181,7 +181,7 @@ def load_benchmark_dataset(self, task: DROPTask) -> List[Golden]:
             output = DELIMITER.join(tuple(data["answers_spans"]["spans"][0]))
             output_type = data["answers_spans"]["types"][0]
             golden = Golden(
-                input=input, expectedOutput=output, context=[output_type]
+                input=input, expected_output=output, context=[output_type]
             )
             goldens.append(golden)

deepeval/benchmarks/gsm8k/gsm8k.py

Lines changed: 1 addition & 1 deletion
@@ -98,7 +98,7 @@ def load_benchmark_dataset(self) -> List[Golden]:
         for data in dataset["test"]:
             input = data["question"]
             output = GSM8KTemplate.format_answer(data)
-            golden = Golden(input=input, expectedOutput=output)
+            golden = Golden(input=input, expected_output=output)
             goldens.append(golden)

         return goldens

deepeval/benchmarks/hellaswag/hellaswag.py

Lines changed: 1 addition & 1 deletion
@@ -192,7 +192,7 @@ def load_benchmark_dataset(self, task: HellaSwagTask) -> List[Golden]:
                 data, include_answer=False
             )
             golden = Golden(
-                input=input, expectedOutput=choices[int(data["label"])]
+                input=input, expected_output=choices[int(data["label"])]
             )
             goldens.append(golden)
         return goldens

deepeval/benchmarks/human_eval/human_eval.py

Lines changed: 1 addition & 1 deletion
@@ -118,6 +118,6 @@ def load_benchmark_dataset(self, task: HumanEvalTask) -> List[Golden]:
         )[0]
         # Construct test set
         golden = Golden(
-            input=test_set["prompt"], expectedOutput=test_set["test"]
+            input=test_set["prompt"], expected_output=test_set["test"]
         )
         return golden

deepeval/benchmarks/mmlu/mmlu.py

Lines changed: 1 addition & 1 deletion
@@ -179,6 +179,6 @@ def load_benchmark_dataset(self, task: MMLUTask) -> List[Golden]:
         goldens: List[Golden] = []
         for data in dataset["test"]:
             input = MMLUTemplate.format_question(data, include_answer=False)
-            golden = Golden(input=input, expectedOutput=data["target"])
+            golden = Golden(input=input, expected_output=data["target"])
             goldens.append(golden)
         return goldens

deepeval/benchmarks/truthful_qa/truthful_qa.py

Lines changed: 2 additions & 2 deletions
@@ -195,14 +195,14 @@ def load_benchmark_dataset(
                 input, expected_output = TruthfulQATemplate.format_mc1_question(
                     data
                 )
-                golden = Golden(input=input, expectedOutput=expected_output)
+                golden = Golden(input=input, expected_output=expected_output)
                 goldens.append(golden)
             elif mode == TruthfulQAMode.MC2:
                 input, expected_output = TruthfulQATemplate.format_mc2_question(
                     data
                 )
                 golden = Golden(
-                    input=input, expectedOutput=str(expected_output)
+                    input=input, expected_output=str(expected_output)
                 )
                 goldens.append(golden)

deepeval/dataset/api.py

Lines changed: 8 additions & 2 deletions
@@ -4,9 +4,15 @@
 from deepeval.dataset.golden import Golden, ConversationalGolden


+def to_snake_case(string: str) -> str:
+    return "".join(
+        ["_" + i.lower() if i.isupper() else i for i in string]
+    ).lstrip("_")
+
+
 class APIDataset(BaseModel):
     alias: str
-    overwrite: bool
+    overwrite: Optional[bool] = None
     goldens: Optional[List[Golden]] = Field(default=[])
     conversational_goldens: Optional[List[ConversationalGolden]] = Field(
         default=[], alias="conversationalGoldens"
@@ -18,7 +24,7 @@ class CreateDatasetHttpResponse(BaseModel):


 class DatasetHttpResponse(BaseModel):
-    goldens: List[Golden]
+    goldens: List[Golden] = Field(alias="goldens")
     conversational_goldens: List[ConversationalGolden] = Field(
         alias="conversationalGoldens"
     )
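
The to_snake_case helper added here (and again in golden.py below) simply prefixes every uppercase character with an underscore and lowercases it, which is enough for the camelCase keys this API uses. A quick standalone sketch of what it produces; the sample keys are illustrative, taken from the field names touched by this PR:

    def to_snake_case(string: str) -> str:
        # Prefix each uppercase letter with "_" and lowercase it,
        # then drop a leading underscore if the string started uppercase.
        return "".join(
            ["_" + i.lower() if i.isupper() else i for i in string]
        ).lstrip("_")

    assert to_snake_case("expectedOutput") == "expected_output"
    assert to_snake_case("conversationalGoldens") == "conversational_goldens"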

deepeval/dataset/dataset.py

Lines changed: 5 additions & 3 deletions
@@ -23,7 +23,7 @@
 )
 from deepeval.dataset.golden import Golden, ConversationalGolden
 from deepeval.test_case import LLMTestCase, ConversationalTestCase
-from deepeval.utils import is_confident
+from deepeval.utils import convert_keys_to_snake_case, is_confident
 from deepeval.synthesizer.base_synthesizer import BaseSynthesizer

 valid_file_types = ["csv", "json"]
@@ -365,8 +365,10 @@ def pull(self, alias: str, auto_convert_goldens_to_test_cases: bool = True):
             )

             response = DatasetHttpResponse(
-                goldens=result["goldens"],
-                conversationalGoldens=result["conversationalGoldens"],
+                goldens=convert_keys_to_snake_case(result["goldens"]),
+                conversationalGoldens=convert_keys_to_snake_case(
+                    result["conversationalGoldens"]
+                ),
                 datasetId=result["datasetId"],
             )
deepeval/dataset/golden.py

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,24 +2,38 @@
22
from typing import Optional, Dict, List
33

44

5+
def to_snake_case(string: str) -> str:
6+
return "".join(
7+
["_" + i.lower() if i.isupper() else i for i in string]
8+
).lstrip("_")
9+
10+
511
class Golden(BaseModel):
612
input: str
7-
actual_output: Optional[str] = Field(None, alias="actualOutput")
8-
expected_output: Optional[str] = Field(None, alias="expectedOutput")
13+
actual_output: Optional[str] = Field(
14+
None, serialization_alias="actualOutput"
15+
)
16+
expected_output: Optional[str] = Field(
17+
None, serialization_alias="expectedOutput"
18+
)
919
context: Optional[List[str]] = Field(None)
1020
retrieval_context: Optional[List[str]] = Field(
11-
None, alias="retrievalContext"
21+
None, serialization_alias="retrievalContext"
1222
)
1323
additional_metadata: Optional[Dict] = Field(
14-
None, alias="additionalMetadata"
24+
None, serialization_alias="additionalMetadata"
1525
)
1626
comments: Optional[str] = Field(None)
17-
source_file: Optional[str] = Field(None, alias="sourceFile")
27+
source_file: Optional[str] = Field(None, serialization_alias="sourceFile")
1828

1929

2030
class ConversationalGolden(BaseModel):
2131
additional_metadata: Optional[Dict] = Field(
22-
None, alias="additionalMetadata"
32+
None, serialization_alias="additionalMetadata"
2333
)
2434
comments: Optional[str] = Field(None)
25-
messages: List[Golden] = Field(default_factory=lambda: [], alias="goldens")
35+
messages: List[Golden] = Field(
36+
default_factory=lambda: [],
37+
validation_alias="goldens",
38+
serialization_alias="goldens",
39+
)
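
Switching from alias to serialization_alias is the heart of this fix. In Pydantic v2 a plain alias is also used for validation, so constructing a Golden with snake_case keyword arguments (as the benchmark loaders now do) would not populate the field unless populate_by_name were enabled; serialization_alias keeps the snake_case name for construction and attribute access and only applies camelCase when dumping with by_alias=True. A minimal sketch of the intended behavior, assuming Pydantic v2 (GoldenSketch is an illustrative stand-in, not the repository class):

    from typing import Optional
    from pydantic import BaseModel, Field

    class GoldenSketch(BaseModel):
        input: str
        # Populated by its Python name; camelCase only appears on serialization.
        expected_output: Optional[str] = Field(
            None, serialization_alias="expectedOutput"
        )

    golden = GoldenSketch(input="What is 2 + 2?", expected_output="4")
    print(golden.model_dump(by_alias=True))  # {'input': 'What is 2 + 2?', 'expectedOutput': '4'}
    print(golden.model_dump())               # {'input': 'What is 2 + 2?', 'expected_output': '4'}

For ConversationalGolden.messages, validation_alias and serialization_alias are paired so the backend's "goldens" key is still accepted on input and emitted on output.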

deepeval/dataset/utils.py

Lines changed: 3 additions & 3 deletions
@@ -10,10 +10,10 @@ def convert_test_cases_to_goldens(
     for test_case in test_cases:
         golden = {
             "input": test_case.input,
-            "actualOutput": test_case.actual_output,
-            "expectedOutput": test_case.expected_output,
+            "actual_output": test_case.actual_output,
+            "expected_output": test_case.expected_output,
             "context": test_case.context,
-            "retrievalContext": test_case.retrieval_context,
+            "retrieval_context": test_case.retrieval_context,
         }
         goldens.append(Golden(**golden))
     return goldens

deepeval/evaluate.py

Lines changed: 3 additions & 3 deletions
@@ -114,12 +114,12 @@ def create_api_test_case(
             # to each individual message (test case)
             test_case.additional_metadata = additional_metadata
             test_case.comments = comments
-            traceStack = None
+            trace_stack = None
         else:
             success = True
             name = os.getenv(PYTEST_RUN_TEST_NAME, f"test_case_{index}")
             order = test_case._dataset_rank
-            traceStack = get_trace_stack()
+            trace_stack = get_trace_stack()

         return LLMApiTestCase(
             name=name,
@@ -135,7 +135,7 @@ def create_api_test_case(
             order=order,
             additionalMetadata=test_case.additional_metadata,
             comments=test_case.comments,
-            traceStack=traceStack,
+            traceStack=trace_stack,
         )

     elif isinstance(test_case, ConversationalTestCase):

deepeval/integrations/hugging_face/tests/test_callbacks.py

Lines changed: 1 addition & 2 deletions
@@ -76,14 +76,13 @@ def create_deepeval_dataset(dataset, sample_size):
     random_index_list = [
         random.randint(0, total_length) for _ in range(sample_size)
     ]
-    print(random_index_list)
     eval_dataset = [dataset[row] for row in random_index_list]
     goldens = []
     for row in eval_dataset:
         context = ["; ".join(row["context"]["contexts"])]
         golden = Golden(
             input=row["question"],
-            expectedOutput=row["long_answer"],
+            expected_output=row["long_answer"],
             context=context,
             retrieval_context=context,
         )

deepeval/integrations/llama_index/callback.py

Lines changed: 7 additions & 5 deletions
@@ -175,10 +175,12 @@ def create_trace_instance(
                 traces=[],
                 llmMetadata=LlmMetadata(
                     model=processed_payload["llm_model_name"],
-                    outputMessages=None,
-                    tokenCount=None,
-                    promptTemplate=processed_payload.get("llm_prompt_template"),
-                    promptTemplateVariables=processed_payload.get(
+                    output_messages=None,
+                    token_count=None,
+                    prompt_template=processed_payload.get(
+                        "llm_prompt_template"
+                    ),
+                    prompt_template_variables=processed_payload.get(
                         "llm_prompt_template_variables"
                     ),
                 ),
@@ -224,7 +226,7 @@
                 traces=[],
                 rerankingMetadata=RerankingMetadata(
                     model=processed_payload["reranker_model_name"],
-                    topK=processed_payload["reranker_top_k"],
+                    top_k=processed_payload["reranker_top_k"],
                 ),
             )

deepeval/metrics/ragas.py

Lines changed: 5 additions & 0 deletions
@@ -92,6 +92,7 @@ def __init__(
         model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
         _track: bool = True,
     ):
+        super.__init__()
         self.threshold = threshold
         self.model = model
         self._track = _track
@@ -160,6 +161,7 @@ def __init__(
         embeddings: Optional[Embeddings] = None,
         _track: bool = True,
     ):
+        super.__init__()
         self.threshold = threshold
         self.model = model
         self._track = _track
@@ -226,6 +228,7 @@ def __init__(
         model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
         _track: bool = True,
     ):
+        super.__init__()
         self.threshold = threshold
         self.model = model
         self._track = _track
@@ -287,6+290,7 @@ def __init__(
         model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
         _track: bool = True,
     ):
+        super.__init__()
         self.threshold = threshold
         self.model = model
         self._track = _track
@@ -348,6 +352,7 @@ def __init__(
         model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
         embeddings: Optional[Embeddings] = None,
     ):
+        super.__init__()
         self.threshold = threshold
         self.model = model
         if isinstance(model, str):

deepeval/synthesizer/synthesizer.py

Lines changed: 1 addition & 1 deletion
@@ -202,7 +202,7 @@ def _generate_from_contexts(
                 source_files[index] if source_files is not None else None
             )
             golden = Golden(
-                input=evolved_input, context=context, sourceFile=source_file
+                input=evolved_input, context=context, source_file=source_file
             )

             if include_expected_output:

deepeval/tracing/tracer.py

Lines changed: 16 additions & 8 deletions
@@ -55,29 +55,37 @@ class TraceStatus(Enum):

 class LlmMetadata(BaseModel):
     model: Optional[str] = None
-    token_count: Optional[Dict[str, int]] = Field(None, alias="tokenCount")
+    token_count: Optional[Dict[str, int]] = Field(
+        None, serialization_alias="tokenCount"
+    )
     output_messages: Optional[List[Dict[str, str]]] = Field(
-        None, alias="outputMessages"
+        None, serialization_alias="outputMessages"
+    )
+    prompt_template: Optional[Any] = Field(
+        None, serialization_alias="promptTemplate"
     )
-    prompt_template: Optional[Any] = Field(None, alias="promptTemplate")
     prompt_template_variables: Optional[Any] = Field(
-        None, alias="promptTemplateVariables"
+        None, serialization_alias="promptTemplateVariables"
     )


 class EmbeddingMetadata(BaseModel):
     model: Optional[str] = None
-    vector_length: Optional[int] = Field(None, alias="vectorLength")
+    vector_length: Optional[int] = Field(
+        None, serialization_alias="vectorLength"
+    )


 class RetrieverMetadata(BaseModel):
-    top_k: Optional[int] = Field(None, alias="topK")
-    average_chunk_size: Optional[int] = Field(None, alias="averageChunkSize")
+    top_k: Optional[int] = Field(None, serialization_alias="topK")
+    average_chunk_size: Optional[int] = Field(
+        None, serialization_alias="averageChunkSize"
+    )


 class RerankingMetadata(BaseModel):
     model: Optional[str] = None
-    top_k: Optional[int] = Field(None, alias="topK")
+    top_k: Optional[int] = Field(None, serialization_alias="topK")


 @dataclass

deepeval/utils.py

Lines changed: 17 additions & 0 deletions
@@ -20,6 +20,23 @@
 from deepeval.key_handler import KeyValues, KEY_FILE_HANDLER


+def camel_to_snake(name: str) -> str:
+    s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
+    return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()
+
+
+def convert_keys_to_snake_case(data: Any) -> Any:
+    if isinstance(data, dict):
+        return {
+            camel_to_snake(k): convert_keys_to_snake_case(v)
+            for k, v in data.items()
+        }
+    elif isinstance(data, list):
+        return [convert_keys_to_snake_case(i) for i in data]
+    else:
+        return data
+
+
 def prettify_list(lst: List[Any]):
     if len(lst) == 0:
         return "[]"
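
camel_to_snake uses two regex passes so it also handles keys with consecutive capitals, and convert_keys_to_snake_case applies it recursively through nested dicts and lists, which is what pull() needs before handing the raw camelCase API payload to the Pydantic models above. A small usage sketch (a standalone copy of the helpers; the payload is illustrative, not a real API response):

    import re
    from typing import Any

    def camel_to_snake(name: str) -> str:
        # Two passes: split before capital runs, then before single capitals.
        s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
        return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()

    def convert_keys_to_snake_case(data: Any) -> Any:
        # Recurse through dicts and lists, renaming only the dict keys.
        if isinstance(data, dict):
            return {
                camel_to_snake(k): convert_keys_to_snake_case(v)
                for k, v in data.items()
            }
        elif isinstance(data, list):
            return [convert_keys_to_snake_case(i) for i in data]
        return data

    payload = [{"input": "What is 2 + 2?", "expectedOutput": "4", "retrievalContext": ["2 + 2 = 4"]}]
    print(convert_keys_to_snake_case(payload))
    # [{'input': 'What is 2 + 2?', 'expected_output': '4', 'retrieval_context': ['2 + 2 = 4']}]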

tests/test_deployment.py

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@
 class FakeMetric(BaseMetric):
     # This metric by default checks if the latency is greater than 10 seconds
     def __init__(self, threshold: float = 0.5):
+        super().__init__()
         self.threshold = threshold

     def measure(self, test_case: LLMTestCase):

tests/test_everything.py

Lines changed: 2 additions & 2 deletions
@@ -80,7 +80,7 @@
 """

 strict_mode = False
-verbose_mode = True
+verbose_mode = False


 @pytest.mark.skip(reason="openai is expensive")
@@ -89,7 +89,7 @@ def test_everything():
         threshold=0.1,
         strict_mode=strict_mode,
         async_mode=False,
-        verbose_mode=True,
+        verbose_mode=verbose_mode,
     )
     metric2 = FaithfulnessMetric(
         threshold=0.5, strict_mode=strict_mode, verbose_mode=verbose_mode
