Fixed alias #845


Merged · 7 commits · Jun 23, 2024
2 changes: 1 addition & 1 deletion deepeval/benchmarks/big_bench_hard/big_bench_hard.py
@@ -171,7 +171,7 @@ def load_benchmark_dataset(self, task: BigBenchHardTask) -> List[Golden]:

goldens: List[Golden] = []
for data in dataset["test"]:
- golden = Golden(input=data["input"], expectedOutput=data["target"])
+ golden = Golden(input=data["input"], expected_output=data["target"])
goldens.append(golden)

return goldens
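Every benchmark loader gets the same one-line fix: `Golden` is now constructed with the snake_case field name instead of the camelCase wire alias. A minimal sketch of why the old keyword silently stops working once the alias is serialization-only (a trimmed-down stand-in for `Golden`, Pydantic v2 assumed):

```python
from typing import Optional
from pydantic import BaseModel, Field

class Golden(BaseModel):
    # Trimmed-down stand-in for deepeval's Golden model.
    input: str
    expected_output: Optional[str] = Field(
        None, serialization_alias="expectedOutput"
    )

ok = Golden(input="2 + 2?", expected_output="4")
print(ok.expected_output)  # "4"

# With serialization_alias, "expectedOutput" is no longer accepted at
# validation time; under the default extra="ignore" it is dropped silently.
stale = Golden(input="2 + 2?", expectedOutput="4")
print(stale.expected_output)  # None
```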
2 changes: 1 addition & 1 deletion deepeval/benchmarks/drop/drop.py
@@ -181,7 +181,7 @@ def load_benchmark_dataset(self, task: DROPTask) -> List[Golden]:
output = DELIMITER.join(tuple(data["answers_spans"]["spans"][0]))
output_type = data["answers_spans"]["types"][0]
golden = Golden(
- input=input, expectedOutput=output, context=[output_type]
+ input=input, expected_output=output, context=[output_type]
)
goldens.append(golden)

2 changes: 1 addition & 1 deletion deepeval/benchmarks/gsm8k/gsm8k.py
@@ -98,7 +98,7 @@ def load_benchmark_dataset(self) -> List[Golden]:
for data in dataset["test"]:
input = data["question"]
output = GSM8KTemplate.format_answer(data)
- golden = Golden(input=input, expectedOutput=output)
+ golden = Golden(input=input, expected_output=output)
goldens.append(golden)

return goldens
2 changes: 1 addition & 1 deletion deepeval/benchmarks/hellaswag/hellaswag.py
@@ -192,7 +192,7 @@ def load_benchmark_dataset(self, task: HellaSwagTask) -> List[Golden]:
data, include_answer=False
)
golden = Golden(
- input=input, expectedOutput=choices[int(data["label"])]
+ input=input, expected_output=choices[int(data["label"])]
)
goldens.append(golden)
return goldens
2 changes: 1 addition & 1 deletion deepeval/benchmarks/human_eval/human_eval.py
@@ -118,6 +118,6 @@ def load_benchmark_dataset(self, task: HumanEvalTask) -> List[Golden]:
)[0]
# Construct test set
golden = Golden(
- input=test_set["prompt"], expectedOutput=test_set["test"]
+ input=test_set["prompt"], expected_output=test_set["test"]
)
return golden
2 changes: 1 addition & 1 deletion deepeval/benchmarks/mmlu/mmlu.py
@@ -179,6 +179,6 @@ def load_benchmark_dataset(self, task: MMLUTask) -> List[Golden]:
goldens: List[Golden] = []
for data in dataset["test"]:
input = MMLUTemplate.format_question(data, include_answer=False)
- golden = Golden(input=input, expectedOutput=data["target"])
+ golden = Golden(input=input, expected_output=data["target"])
goldens.append(golden)
return goldens
4 changes: 2 additions & 2 deletions deepeval/benchmarks/truthful_qa/truthful_qa.py
@@ -195,14 +195,14 @@ def load_benchmark_dataset(
input, expected_output = TruthfulQATemplate.format_mc1_question(
data
)
- golden = Golden(input=input, expectedOutput=expected_output)
+ golden = Golden(input=input, expected_output=expected_output)
goldens.append(golden)
elif mode == TruthfulQAMode.MC2:
input, expected_output = TruthfulQATemplate.format_mc2_question(
data
)
golden = Golden(
- input=input, expectedOutput=str(expected_output)
+ input=input, expected_output=str(expected_output)
)
goldens.append(golden)

10 changes: 8 additions & 2 deletions deepeval/dataset/api.py
@@ -4,9 +4,15 @@
from deepeval.dataset.golden import Golden, ConversationalGolden


+ def to_snake_case(string: str) -> str:
+     return "".join(
+         ["_" + i.lower() if i.isupper() else i for i in string]
+     ).lstrip("_")


class APIDataset(BaseModel):
alias: str
- overwrite: bool
+ overwrite: Optional[bool] = None
goldens: Optional[List[Golden]] = Field(default=[])
conversational_goldens: Optional[List[ConversationalGolden]] = Field(
default=[], alias="conversationalGoldens"
@@ -18,7 +24,7 @@ class CreateDatasetHttpResponse(BaseModel):


class DatasetHttpResponse(BaseModel):
- goldens: List[Golden]
+ goldens: List[Golden] = Field(alias="goldens")
conversational_goldens: List[ConversationalGolden] = Field(
alias="conversationalGoldens"
)
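The `to_snake_case` helper works character by character, so it handles simple camelCase keys but splits acronym runs letter by letter, unlike the regex-based `camel_to_snake` added in `deepeval/utils.py` further down. A quick check of both behaviors (the printed outputs are my reading of the code, not taken from the PR):

```python
import re

def to_snake_case(string: str) -> str:
    # Char-by-char version from api.py / golden.py.
    return "".join(
        ["_" + i.lower() if i.isupper() else i for i in string]
    ).lstrip("_")

def camel_to_snake(name: str) -> str:
    # Regex version from deepeval/utils.py.
    s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
    return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()

print(to_snake_case("expectedOutput"))  # expected_output
print(to_snake_case("LLMTestCase"))     # l_l_m_test_case
print(camel_to_snake("LLMTestCase"))    # llm_test_case
```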
8 changes: 5 additions & 3 deletions deepeval/dataset/dataset.py
@@ -23,7 +23,7 @@
)
from deepeval.dataset.golden import Golden, ConversationalGolden
from deepeval.test_case import LLMTestCase, ConversationalTestCase
- from deepeval.utils import is_confident
+ from deepeval.utils import convert_keys_to_snake_case, is_confident
from deepeval.synthesizer.base_synthesizer import BaseSynthesizer

valid_file_types = ["csv", "json"]
@@ -365,8 +365,10 @@ def pull(self, alias: str, auto_convert_goldens_to_test_cases: bool = True):
)

response = DatasetHttpResponse(
- goldens=result["goldens"],
- conversationalGoldens=result["conversationalGoldens"],
+ goldens=convert_keys_to_snake_case(result["goldens"]),
+ conversationalGoldens=convert_keys_to_snake_case(
+     result["conversationalGoldens"]
+ ),
datasetId=result["datasetId"],
)

28 changes: 21 additions & 7 deletions deepeval/dataset/golden.py
@@ -2,24 +2,38 @@
from typing import Optional, Dict, List


+ def to_snake_case(string: str) -> str:
+     return "".join(
+         ["_" + i.lower() if i.isupper() else i for i in string]
+     ).lstrip("_")


class Golden(BaseModel):
input: str
- actual_output: Optional[str] = Field(None, alias="actualOutput")
- expected_output: Optional[str] = Field(None, alias="expectedOutput")
+ actual_output: Optional[str] = Field(
+     None, serialization_alias="actualOutput"
+ )
+ expected_output: Optional[str] = Field(
+     None, serialization_alias="expectedOutput"
+ )
context: Optional[List[str]] = Field(None)
retrieval_context: Optional[List[str]] = Field(
None, alias="retrievalContext"
None, serialization_alias="retrievalContext"
)
additional_metadata: Optional[Dict] = Field(
None, alias="additionalMetadata"
None, serialization_alias="additionalMetadata"
)
comments: Optional[str] = Field(None)
- source_file: Optional[str] = Field(None, alias="sourceFile")
+ source_file: Optional[str] = Field(None, serialization_alias="sourceFile")


class ConversationalGolden(BaseModel):
additional_metadata: Optional[Dict] = Field(
None, alias="additionalMetadata"
None, serialization_alias="additionalMetadata"
)
comments: Optional[str] = Field(None)
- messages: List[Golden] = Field(default_factory=lambda: [], alias="goldens")
+ messages: List[Golden] = Field(
+     default_factory=lambda: [],
+     validation_alias="goldens",
+     serialization_alias="goldens",
+ )
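The switch from `alias` to `serialization_alias` is the heart of the fix. In Pydantic v2 a plain `alias` is used for validation, so `Golden(expected_output=...)` would leave the field unset unless `populate_by_name` were enabled; `serialization_alias` keeps construction on the snake_case names and only changes how the model dumps. `messages` additionally keeps `validation_alias="goldens"` so API payloads still parse. A sketch of the round trip (Pydantic v2 assumed):

```python
from typing import Optional
from pydantic import BaseModel, Field

class Golden(BaseModel):
    input: str
    expected_output: Optional[str] = Field(
        None, serialization_alias="expectedOutput"
    )

g = Golden(input="q", expected_output="a")  # snake_case in Python code
print(g.model_dump())               # {'input': 'q', 'expected_output': 'a'}
print(g.model_dump(by_alias=True))  # {'input': 'q', 'expectedOutput': 'a'}
```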
6 changes: 3 additions & 3 deletions deepeval/dataset/utils.py
@@ -10,10 +10,10 @@ def convert_test_cases_to_goldens(
for test_case in test_cases:
golden = {
"input": test_case.input,
"actualOutput": test_case.actual_output,
"expectedOutput": test_case.expected_output,
"actual_output": test_case.actual_output,
"expected_output": test_case.expected_output,
"context": test_case.context,
"retrievalContext": test_case.retrieval_context,
"retrieval_context": test_case.retrieval_context,
}
goldens.append(Golden(**golden))
return goldens
6 changes: 3 additions & 3 deletions deepeval/evaluate.py
@@ -114,12 +114,12 @@ def create_api_test_case(
# to each individual message (test case)
test_case.additional_metadata = additional_metadata
test_case.comments = comments
- traceStack = None
+ trace_stack = None
else:
success = True
name = os.getenv(PYTEST_RUN_TEST_NAME, f"test_case_{index}")
order = test_case._dataset_rank
- traceStack = get_trace_stack()
+ trace_stack = get_trace_stack()

return LLMApiTestCase(
name=name,
@@ -135,7 +135,7 @@
order=order,
additionalMetadata=test_case.additional_metadata,
comments=test_case.comments,
- traceStack=traceStack,
+ traceStack=trace_stack,
)

elif isinstance(test_case, ConversationalTestCase):
3 changes: 1 addition & 2 deletions deepeval/integrations/hugging_face/tests/test_callbacks.py
@@ -76,14 +76,13 @@ def create_deepeval_dataset(dataset, sample_size):
random_index_list = [
random.randint(0, total_length) for _ in range(sample_size)
]
- print(random_index_list)
eval_dataset = [dataset[row] for row in random_index_list]
goldens = []
for row in eval_dataset:
context = ["; ".join(row["context"]["contexts"])]
golden = Golden(
input=row["question"],
- expectedOutput=row["long_answer"],
+ expected_output=row["long_answer"],
context=context,
retrieval_context=context,
)
12 changes: 7 additions & 5 deletions deepeval/integrations/llama_index/callback.py
@@ -175,10 +175,12 @@ def create_trace_instance(
traces=[],
llmMetadata=LlmMetadata(
model=processed_payload["llm_model_name"],
- outputMessages=None,
- tokenCount=None,
- promptTemplate=processed_payload.get("llm_prompt_template"),
- promptTemplateVariables=processed_payload.get(
+ output_messages=None,
+ token_count=None,
+ prompt_template=processed_payload.get(
+     "llm_prompt_template"
+ ),
+ prompt_template_variables=processed_payload.get(
"llm_prompt_template_variables"
),
),
@@ -224,7 +226,7 @@ def create_trace_instance(
traces=[],
rerankingMetadata=RerankingMetadata(
model=processed_payload["reranker_model_name"],
- topK=processed_payload["reranker_top_k"],
+ top_k=processed_payload["reranker_top_k"],
),
)

5 changes: 5 additions & 0 deletions deepeval/metrics/ragas.py
@@ -92,6 +92,7 @@ def __init__(
model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
_track: bool = True,
):
+ super().__init__()
self.threshold = threshold
self.model = model
self._track = _track
@@ -160,6 +161,7 @@ def __init__(
embeddings: Optional[Embeddings] = None,
_track: bool = True,
):
+ super().__init__()
self.threshold = threshold
self.model = model
self._track = _track
@@ -226,6 +228,7 @@ def __init__(
model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
_track: bool = True,
):
+ super().__init__()
self.threshold = threshold
self.model = model
self._track = _track
@@ -287,6 +290,7 @@ def __init__(
model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
_track: bool = True,
):
+ super().__init__()
self.threshold = threshold
self.model = model
self._track = _track
@@ -348,6 +352,7 @@ def __init__(
model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
embeddings: Optional[Embeddings] = None,
):
+ super().__init__()
self.threshold = threshold
self.model = model
if isinstance(model, str):
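The new `super().__init__()` calls matter because these Ragas wrappers subclass a base metric that presumably sets shared state in its own initializer; skipping the call leaves those attributes missing at runtime. A minimal illustration of the failure mode with a hypothetical base class, not deepeval's actual `BaseMetric`:

```python
class BaseMetric:
    def __init__(self):
        # Shared state that subclasses rely on later.
        self.success = False
        self.error = None

class BrokenMetric(BaseMetric):
    def __init__(self, threshold: float = 0.5):
        self.threshold = threshold  # parent __init__ never runs

class FixedMetric(BaseMetric):
    def __init__(self, threshold: float = 0.5):
        super().__init__()
        self.threshold = threshold

print(FixedMetric().success)   # False
print(BrokenMetric().success)  # AttributeError: no attribute 'success'
```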
2 changes: 1 addition & 1 deletion deepeval/synthesizer/synthesizer.py
@@ -202,7 +202,7 @@ def _generate_from_contexts(
source_files[index] if source_files is not None else None
)
golden = Golden(
- input=evolved_input, context=context, sourceFile=source_file
+ input=evolved_input, context=context, source_file=source_file
)

if include_expected_output:
24 changes: 16 additions & 8 deletions deepeval/tracing/tracer.py
@@ -55,29 +55,37 @@ class TraceStatus(Enum):

class LlmMetadata(BaseModel):
model: Optional[str] = None
- token_count: Optional[Dict[str, int]] = Field(None, alias="tokenCount")
+ token_count: Optional[Dict[str, int]] = Field(
+     None, serialization_alias="tokenCount"
+ )
output_messages: Optional[List[Dict[str, str]]] = Field(
None, alias="outputMessages"
None, serialization_alias="outputMessages"
)
- prompt_template: Optional[Any] = Field(None, alias="promptTemplate")
+ prompt_template: Optional[Any] = Field(
+     None, serialization_alias="promptTemplate"
+ )
prompt_template_variables: Optional[Any] = Field(
None, alias="promptTemplateVariables"
None, serialization_alias="promptTemplateVariables"
)


class EmbeddingMetadata(BaseModel):
model: Optional[str] = None
- vector_length: Optional[int] = Field(None, alias="vectorLength")
+ vector_length: Optional[int] = Field(
+     None, serialization_alias="vectorLength"
+ )


class RetrieverMetadata(BaseModel):
top_k: Optional[int] = Field(None, alias="topK")
average_chunk_size: Optional[int] = Field(None, alias="averageChunkSize")
top_k: Optional[int] = Field(None, serialization_alias="topK")
average_chunk_size: Optional[int] = Field(
None, serialization_alias="averageChunkSize"
)


class RerankingMetadata(BaseModel):
model: Optional[str] = None
top_k: Optional[int] = Field(None, alias="topK")
top_k: Optional[int] = Field(None, serialization_alias="topK")


@dataclass
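Same pattern for the tracing models: callers (like the llama_index callback above) now build them with snake_case keywords, and the camelCase spelling appears only when a trace is serialized with aliases. A small sketch (Pydantic v2 assumed, placeholder model name):

```python
from typing import Optional
from pydantic import BaseModel, Field

class RerankingMetadata(BaseModel):
    model: Optional[str] = None
    top_k: Optional[int] = Field(None, serialization_alias="topK")

meta = RerankingMetadata(model="my-reranker", top_k=5)  # snake_case in code
print(meta.model_dump(by_alias=True))  # {'model': 'my-reranker', 'topK': 5}
```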
17 changes: 17 additions & 0 deletions deepeval/utils.py
@@ -20,6 +20,23 @@
from deepeval.key_handler import KeyValues, KEY_FILE_HANDLER


+ def camel_to_snake(name: str) -> str:
+     s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
+     return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()


+ def convert_keys_to_snake_case(data: Any) -> Any:
+     if isinstance(data, dict):
+         return {
+             camel_to_snake(k): convert_keys_to_snake_case(v)
+             for k, v in data.items()
+         }
+     elif isinstance(data, list):
+         return [convert_keys_to_snake_case(i) for i in data]
+     else:
+         return data


def prettify_list(lst: List[Any]):
if len(lst) == 0:
return "[]"
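`convert_keys_to_snake_case` recurses through nested dicts and lists, which is exactly what `EvaluationDataset.pull` needs: each golden arrives from the API as a camelCase JSON object. A usage sketch with a made-up payload whose shape mimics a pulled golden:

```python
import re
from typing import Any

def camel_to_snake(name: str) -> str:
    s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
    return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()

def convert_keys_to_snake_case(data: Any) -> Any:
    # Recurse through dicts and lists, converting every key.
    if isinstance(data, dict):
        return {
            camel_to_snake(k): convert_keys_to_snake_case(v)
            for k, v in data.items()
        }
    elif isinstance(data, list):
        return [convert_keys_to_snake_case(i) for i in data]
    return data

# Made-up payload; field values are placeholders.
payload = [{"input": "q", "expectedOutput": "a", "retrievalContext": ["ctx"]}]
print(convert_keys_to_snake_case(payload))
# [{'input': 'q', 'expected_output': 'a', 'retrieval_context': ['ctx']}]
```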
1 change: 1 addition & 0 deletions tests/test_deployment.py
@@ -11,6 +11,7 @@
class FakeMetric(BaseMetric):
# This metric by default checks if the latency is greater than 10 seconds
def __init__(self, threshold: float = 0.5):
+ super().__init__()
self.threshold = threshold

def measure(self, test_case: LLMTestCase):
4 changes: 2 additions & 2 deletions tests/test_everything.py
@@ -80,7 +80,7 @@
"""

strict_mode = False
- verbose_mode = True
+ verbose_mode = False


@pytest.mark.skip(reason="openai is expensive")
@@ -89,7 +89,7 @@ def test_everything():
threshold=0.1,
strict_mode=strict_mode,
async_mode=False,
- verbose_mode=True,
+ verbose_mode=verbose_mode,
)
metric2 = FaithfulnessMetric(
threshold=0.5, strict_mode=strict_mode, verbose_mode=verbose_mode