
Commit 9dc8b1c

Merge pull request #255 from confident-ai/feature/updatedocs
Feature/updatedocs
2 parents: 58b6d76 + 4534c23

3 files changed: 98 additions & 53 deletions

docs/docs/evaluation-metrics.mdx

Lines changed: 13 additions & 23 deletions
@@ -42,40 +42,30 @@ A custom LLM evaluated metric is a custom metric whose evaluation is powered by

 ```python
 from deepeval.metrics.llm_eval_metric import LLMEvalMetric
+from deepeval.types import LLMTestCaseParams

-funny_metric = LLMEvalMetric(
-    name="Funny",
-    criteria="How funny it is",
+summarization_metric = LLMEvalMetric(
+    name="Summarization",
+    criteria="Summarization - determine if the actual output is an accurate and concise summarization of the input.",
+    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
     minimum_score=0.5
 )
 ```

-There are two mandatory and two optional parameters required when instantiating an `LLMEvalMetric` class:
+There are three mandatory and one optional parameter required when instantiating an `LLMEvalMetric` class:

-- `name`
-- `criteria`
+- `name`: the name of the metric
+- `criteria`: a description outlining the specific evaluation aspects for each test case.
+- `evaluation_params`: a list of type `LLMTestCaseParams`. Include only the parameters that are relevant for evaluation.
 - [Optional] `minimum_score`
-- [Optional] `completion_function`

-All instances of `LLMEvalMetric` return a score ranging from 0-1. A metric is only successful if the evaluation score is equal to or greater than `minimum_score`.
+All instances of `LLMEvalMetric` return a score ranging from 0 - 1. A metric is only successful if the evaluation score is equal to or greater than `minimum_score`.

-:::info
-`LLMEvalMetric` may or may not require `context` or `expected_output` supplied to `LLMTestCase`, but we recommend providing both arguments where possible for the most accurate evaluation.
+:::danger
+For accurate and valid results, only the parameters mentioned in `criteria` should be included as members of `evaluation_params`.
 :::

-You can also supply a custom `completion_function` if, for example, you want to utilize another LLM provider to evaluate your `LLMTestCase`. By default, `deepeval` uses the `openai` chat completion function.
-
-```python
-def make_chat_completion_request(prompt: str):
-    response = openai.ChatCompletion.create(
-        model="gpt-3.5-turbo",
-        messages=[
-            {"role": "system", "content": "You are a helpful assistant."},
-            {"role": "user", "content": prompt},
-        ],
-    )
-    return response.choices[0].message.content
-```
+By default, `LLMEvalMetric` is evaluated using `GPT-4` from OpenAI.

 ## Custom Classic Metrics

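For readers skimming this diff, here is a rough sketch of how the newly documented metric could be exercised on its own, outside of `assert_test`. The `LLMTestCase` import path and the `measure()` / `is_successful()` calls are assumptions based on deepeval's usual metric interface, not something shown in this commit.

```python
# Sketch only: assumes deepeval's standard metric interface (measure / is_successful)
# and the deepeval.test_case import path; neither is confirmed by this diff.
from deepeval.test_case import LLMTestCase
from deepeval.metrics.llm_eval_metric import LLMEvalMetric
from deepeval.types import LLMTestCaseParams

summarization_metric = LLMEvalMetric(
    name="Summarization",
    criteria="Summarization - determine if the actual output is an accurate and concise summarization of the input.",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
    minimum_score=0.5,
)

# Hypothetical test case for illustration
test_case = LLMTestCase(
    input="What if these shoes don't fit? I want a full refund.",
    actual_output="If the shoes don't fit, the customer wants a full refund.",
)

score = summarization_metric.measure(test_case)  # 0 - 1 score produced by the evaluating LLM
print(score, summarization_metric.is_successful())  # successful if score >= minimum_score
```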
docs/docs/getting-started.mdx

Lines changed: 49 additions & 15 deletions
@@ -103,14 +103,20 @@ from deepeval.metrics.llm_eval_metric import LLMEvalMetric

 ...

-def test_humor():
-    input = "What if these shoes don't fit?"
+def test_summarization():
+    input = "What if these shoes don't fit? I want a full refund."

-    # Replace this with the actual output of your LLM application
-    actual_output = "We offer a 30-day full refund at no extra cost."
-    funny_metric = LLMEvalMetric(name="Funny Metric", criteria="How funny it is", minimum_score=0.3)
+    # Replace this with the actual output from your LLM application
+    actual_output = "If the shoes don't fit, the customer wants a full refund."
+
+    summarization_metric = LLMEvalMetric(
+        name="Summarization",
+        criteria="Summarization - determine if the actual output is an accurate and concise summarization of the input.",
+        evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
+        minimum_score=0.5
+    )
     test_case = LLMTestCase(input=input, actual_output=actual_output)
-    assert_test(test_case, [length_metric])
+    assert_test(test_case, [summarization_metric])
 ```

 ### Classic Metrics

@@ -181,10 +187,15 @@ def test_everything():
     actual_output = "We offer a 30-day full refund at no extra cost."
     factual_consistency_metric = FactualConsistencyMetric(minimum_score=0.7)
     length_metric = LengthMetric(max_length=10)
-    funny_metric = LLMEvalMetric(name="Funny Metric", criteria="How funny it is", minimum_score=0.3)
+    summarization_metric = LLMEvalMetric(
+        name="Summarization",
+        criteria="Summarization - determine if the actual output is an accurate and concise summarization of the input.",
+        evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
+        minimum_score=0.5
+    )

     test_case = LLMTestCase(input=input, actual_output=actual_output, context=context)
-    assert_test(test_case, [factual_consistency_metric, length_metric, funny_metric])
+    assert_test(test_case, [factual_consistency_metric, length_metric, summarization_metric])
 ```

 In this scenario, `test_everything` only passes if all metrics are passing. Run `deepeval test run` again to see the results:

@@ -267,20 +278,16 @@ deepeval test run test_bulk.py

 If you have reached this point, you've likely run `deepeval test run` multiple times. To keep track of all future evaluation results created by `deepeval`, login to **[Confident AI](https://app.confident-ai.com/auth/signup)** by running the following command:

-```
-
+```console
 deepeval login
-
 ```

 **Confident AI** is the platform powering `deepeval`, and offers deep insights to help you quickly figure out how to best implement your LLM application. Follow the instructions displayed on the CLI to create an account, get your Confident API key, and paste it in the CLI.

 Once you've pasted your Confident API key in the CLI, run:

-```
-
-deepeval test run test_examply.py
-
+```console
+deepeval test run test_example.py
 ```

 ### View Test Run

@@ -295,6 +302,33 @@ You can also view individual test cases for enhanced debugging:

 ![ok](https://d2lsxfc3p6r9rv.cloudfront.net/dashboard2.png)

+### Compare Hyperparameters
+
+To log hyperparameters (such as the prompt templates used) for your LLM application, paste the following code into `test_example.py`:
+
+```python title="test_example.py"
+import deepeval
+
+...
+
+@deepeval.set_hyperparameters
+def hyperparameters():
+    return {
+        "chunk_size": 500,
+        "temperature": 0,
+        "model": "GPT-4",
+        "prompt_template": """You are a helpful assistant, answer the following question in a non-judgemental tone.
+
+Question:
+{question}
+""",
+    }
+```
+
+Execute `deepeval test run test_example.py` again to start comparing hyperparameters for each test run.
+
+![ok](https://d2lsxfc3p6r9rv.cloudfront.net/dashboard3.png)
+
 ## Full Example

 You can find the full example [here on our Github](https://github.com/confident-ai/deepeval/blob/main/examples/getting_started/test_example.py).

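The logged `prompt_template` is only useful for comparison if it matches the prompt the application actually sends to the model. Here is a minimal, hypothetical sketch of keeping the two in sync; the `build_prompt` helper is illustrative and not part of this commit.

```python
# Hypothetical helper: reuse the same template that is logged via
# @deepeval.set_hyperparameters so dashboard comparisons reflect the real prompt.
PROMPT_TEMPLATE = """You are a helpful assistant, answer the following question in a non-judgemental tone.

Question:
{question}
"""

def build_prompt(question: str) -> str:
    # Fill the {question} placeholder before the prompt is sent to the LLM.
    return PROMPT_TEMPLATE.format(question=question)

print(build_prompt("What if these shoes don't fit?"))
```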
examples/getting_started/test_example.py

Lines changed: 36 additions & 15 deletions
@@ -5,6 +5,7 @@
 from deepeval.metrics.llm_eval_metric import LLMEvalMetric
 from deepeval.types import LLMTestCaseParams
 from deepeval.metrics.base_metric import BaseMetric
+import deepeval


 def test_factual_consistency():

@@ -22,19 +23,23 @@ def test_factual_consistency():
     assert_test(test_case, [factual_consistency_metric])


-def test_humor():
-    input = "What if these shoes don't fit?"
+def test_summarization():
+    input = "What if these shoes don't fit? I want a full refund."

     # Replace this with the actual output from your LLM application
-    actual_output = "We offer a 30-day full refund at no extra cost."
-    funny_metric = LLMEvalMetric(
-        name="Funny Metric",
-        criteria="How funny the actual output is",
-        minimum_score=0.3,
-        evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+    actual_output = "If the shoes don't fit, the customer wants a full refund."
+
+    summarization_metric = LLMEvalMetric(
+        name="Summarization",
+        criteria="Summarization - determine if the actual output is an accurate and concise summarization of the input.",
+        evaluation_params=[
+            LLMTestCaseParams.INPUT,
+            LLMTestCaseParams.ACTUAL_OUTPUT,
+        ],
+        minimum_score=0.5,
     )
     test_case = LLMTestCase(input=input, actual_output=actual_output)
-    assert_test(test_case, [funny_metric])
+    assert_test(test_case, [summarization_metric])


 class LengthMetric(BaseMetric):

@@ -78,16 +83,32 @@ def test_everything():
     actual_output = "We offer a 30-day full refund at no extra cost."
     factual_consistency_metric = FactualConsistencyMetric(minimum_score=0.7)
     length_metric = LengthMetric(max_length=10)
-    funny_metric = LLMEvalMetric(
-        name="Funny Metric",
-        criteria="How funny it is",
-        minimum_score=0.3,
-        evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+    summarization_metric = LLMEvalMetric(
+        name="Summarization",
+        criteria="Summarization - determine if the actual output is an accurate and concise summarization of the input.",
+        evaluation_params=[
+            LLMTestCaseParams.INPUT,
+            LLMTestCaseParams.ACTUAL_OUTPUT,
+        ],
+        minimum_score=0.5,
     )

     test_case = LLMTestCase(
         input=input, actual_output=actual_output, context=context
     )
     assert_test(
-        test_case, [factual_consistency_metric, length_metric, funny_metric]
+        test_case,
+        [factual_consistency_metric, length_metric, summarization_metric],
     )
+
+
+@deepeval.set_hyperparameters
+def hyperparameters():
+    return {
+        "model": "GPT-4",
+        "prompt_template": """You are a helpful assistant, answer the following question in a non-judgemental tone.
+
+Question:
+{question}
+""",
+    }

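To try the updated example, the docs run `deepeval test run test_example.py` from the file's own directory; from the repository root the equivalent would presumably be the full relative path (path handling here is an assumption, since the command appears to hand the file path to the underlying test runner).

```console
deepeval test run examples/getting_started/test_example.py
```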