cohere-ai
diff --git a/‎README.md
Lines changed: 21 additions & 0 deletions b/‎README.md
Lines changed: 21 additions & 0 deletions
diff --git a/‎SEA/README.md
Lines changed: 3 additions & 0 deletions b/‎SEA/README.md
Lines changed: 3 additions & 0 deletions
diff --git a/‎SEA/build/lib/deep_planner/__init__.py
Lines changed: 3 additions & 0 deletions b/‎SEA/build/lib/deep_planner/__init__.py
Lines changed: 3 additions & 0 deletions
diff --git a/‎SEA/build/lib/deep_planner/compiler.py
Lines changed: 18 additions & 0 deletions b/‎SEA/build/lib/deep_planner/compiler.py
Lines changed: 18 additions & 0 deletions
diff --git a/‎SEA/build/lib/deep_planner/deep_planner.py
Lines changed: 85 additions & 0 deletions b/‎SEA/build/lib/deep_planner/deep_planner.py
Lines changed: 85 additions & 0 deletions
diff --git a/‎SEA/build/lib/deep_planner/evaluator.py
Lines changed: 52 additions & 0 deletions b/‎SEA/build/lib/deep_planner/evaluator.py
Lines changed: 52 additions & 0 deletions
diff --git a/‎SEA/build/lib/deep_planner/prompts.py
Lines changed: 72 additions & 0 deletions b/‎SEA/build/lib/deep_planner/prompts.py
Lines changed: 72 additions & 0 deletions
diff --git a/‎SEA/deep_planner.egg-info/PKG-INFO
Lines changed: 6 additions & 0 deletions b/‎SEA/deep_planner.egg-info/PKG-INFO
Lines changed: 6 additions & 0 deletions
diff --git a/‎SEA/deep_planner.egg-info/SOURCES.txt
Lines changed: 11 additions & 0 deletions b/‎SEA/deep_planner.egg-info/SOURCES.txt
Lines changed: 11 additions & 0 deletions
diff --git a/‎SEA/deep_planner.egg-info/dependency_links.txt
Lines changed: 1 addition & 0 deletions b/‎SEA/deep_planner.egg-info/dependency_links.txt
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1,21 @@
+# On the Efficacy of Self-reflection for Improving LLM Agent Planning
+
+This repository contains the code for our paper: "On the Efficacy of Self-reflection for Improving LLM Agent Planning".
+
+## Navigation
+The root of the repository contains the following sub-repositories:
+- `SEA/`: Contains the code for our self-reflection framework, SEA (Sample-Evaluate-Aggregate).
+- `ToolTalk/`: Contains our version of the [ToolTalk benchmark repository](https://github.com/microsoft/ToolTalk).
+- `ToolSandbox/`: Contains our version of the [ToolSandbox benchmark repository](https://github.com/apple/ToolSandbox).
+
+
+## Details
+
+Both `ToolTalk` and `ToolSandbox` contain the original benchmark code with the necessary adaptations to run our experiments. 
+Both sub-repositories contain the necessary code to run the experiments, with instructions provided at the top of the respective READMEs. Note that different environment settings will likely be required to run each benchmark.
+Additionally, the respective experimental results for each benchmark can be found in the `results` folder in the root of each sub-repository.
+
+
+## Citation
+```
+```
@@ -0,0 +1,3 @@
+# Sample - Evaluate - Aggregate (SEA) framework
+
+This repository contains the code for our LLM self-reflection framework, SEA (Sample-Evaluate-Aggregate).
@@ -0,0 +1,3 @@
+from .compiler import Compiler
+from .evaluator import Evaluator
+from .deep_planner import DeepPlanner
@@ -0,0 +1,18 @@
+
+from .prompts import COMPILER_PROMPT
+
+
+class Compiler():
+    def __init__(self, n_plans: int, generate_func: callable):
+        self.n_plans = n_plans
+        self.generate = generate_func
+
+        self.prompt = COMPILER_PROMPT(n_plans=n_plans)
+
+    def compile(self, plan_feedback: dict, previous_convervation_str: str, tools_string: str) -> str:
+        prompt = self.prompt.format(
+            tools_str=tools_string,
+            plans_and_feedback="\n\n".join([f"{plan}\nFeedback: {feedback}" for plan, feedback in plan_feedback.items()]),
+            previous_conversation=previous_convervation_str
+        )
+        return self.generate(prompt, 0)
@@ -0,0 +1,85 @@
+import json
+from deep_planner import Evaluator, Compiler
+
+from typing import List, Optional
+
+class DeepPlanner():
+    def __init__(
+            self, 
+            n_plans: int, 
+            temperature: float | List[float], 
+            plan_generator_func: callable, 
+            eval_generate_func: callable,
+            compile_generate_func: callable,
+            extract_plan_func: Optional[callable],
+            is_split_feedback: Optional[bool] = True,
+            is_printing: Optional[bool] = False
+        ):
+        self.n_plans = n_plans
+        self.temperature = temperature
+        self.generate_plan = plan_generator_func
+        self.extract_plan = extract_plan_func
+        self.evaluator = Evaluator(n_plans, eval_generate_func, is_split_feedback)
+        self.compiler = Compiler(n_plans, compile_generate_func)
+        self.is_printing = is_printing
+
+        if isinstance(temperature, list):
+            assert len(temperature) == n_plans, "Number of temperatures must match number of plans."
+
+
+    def print(self, *args, **kwargs):
+        if self.is_printing:
+            print(*args, **kwargs)
+
+    def sample_plans(self, prompt: Optional[str], messages: Optional[List[dict]]) -> List[str]:
+        plan_options = []
+
+        # Sample multiple unique plans
+        temperatures = self.temperature if isinstance(self.temperature, list) else [self.temperature for _ in range(self.n_plans)]
+        
+        while len(plan_options) < self.n_plans:
+            plan_temp = temperatures[len(plan_options)]
+            
+            if messages:
+                self.print(f"Generating plans at temperature={self.temperature} from message...")
+                plan = self.generate_plan(messages=messages, temperature=plan_temp)
+            else:
+                self.print(f"Generating plans at temperature={self.temperature} from prompt...")
+                plan = self.generate_plan(prompt=prompt, temperature=plan_temp)
+
+            if plan.strip() != "":
+                if self.extract_plan:
+                    plan = self.extract_plan(plan)
+
+                if plan not in plan_options:
+                    plan_options.append(plan)
+
+        return plan_options
+
+
+    def plan(
+            self, 
+            previous_conversation_str: str, 
+            tools_string: str,
+            planning_prompt: Optional[str] = None,  
+            planning_messages: Optional[List[dict]] = None
+        ) -> str:
+        assert planning_prompt is not None or planning_messages is not None, "Either planning_prompt or planning_messages must be provided."
+        assert planning_prompt is None or planning_messages is None, "Only one of planning_prompt or planning_messages should be provided."
+
+        plans = self.sample_plans(prompt=planning_prompt, messages=planning_messages)
+
+        self.print("Plans: ", plans)
+        if len(plans) < 1:
+            raise ValueError("No plans generated.")
+        
+        self.print("Evaluating plans...")
+        feedback = self.evaluator.evaluate(plans, previous_conversation_str, tools_string)
+        self.print("Plans and feedback: ", json.dumps(feedback))
+
+        self.print("Compiling final plan...")
+        final_plan = self.extract_plan(self.compiler.compile(feedback, previous_conversation_str, tools_string))
+        self.print("Compiled plan: ", final_plan)
+
+        return final_plan
+    
@@ -0,0 +1,52 @@
+from typing import List
+from .prompts import EVAL_PROMPT_TOOLS, EVAL_PROMPT_ADEQUACY, EVAL_PROMPT_BOTH
+
+class Evaluator():
+    def __init__(self, 
+                 n_plans: int,
+                 generate_func: callable,
+                 is_split_feedback: bool = True):
+        self.generate = generate_func
+        self.is_split_feedback = is_split_feedback
+        self.prompt_all = EVAL_PROMPT_BOTH(n_plans)
+        self.prompt_tools = EVAL_PROMPT_TOOLS(n_plans)
+        self.prompt_adequacy = EVAL_PROMPT_ADEQUACY(n_plans) 
+
+
+    def evaluate(
+            self, 
+            plan_options: List[str], 
+            previous_convervation_str: str, 
+            tools_string: str
+        ) -> str:
+        feedback = {}
+        plan_options = [f"Plan {i+1}: " + plan.strip() for i, plan in enumerate(plan_options)]
+        for i, plan in enumerate(plan_options):
+            other_plans = plan_options[:i] + plan_options[i+1:]
+            other_plans = "\n\n".join(other_plans)
+
+            if not self.is_split_feedback:
+                prompt = self.prompt.format(
+                    tools_str=self.tools_string,
+                    previous_conversation=previous_convervation_str,
+                    alternative_plans=other_plans,
+                    plan=plan
+                )
+                feedback[plan] = self.generate(prompt, 0.0)
+            else:
+                prompt_tools = self.prompt_tools.format(
+                    tools_str=tools_string,
+                    previous_conversation=previous_convervation_str,
+                    alternative_plans=other_plans,
+                    plan=plan
+                )
+                prompt_adequacy = self.prompt_adequacy.format(
+                    tools_str=tools_string,
+                    previous_conversation=previous_convervation_str,
+                    alternative_plans=other_plans,
+                    plan=plan
+                )
+                feedback[plan] = f"{self.generate(prompt_tools, 0.0)}\n{self.generate(prompt_adequacy, 0.0)}"
+            
+        return feedback
+
@@ -0,0 +1,72 @@
+COMPILER_PROMPT = lambda n_plans:  "".join((
+    "You are tasked with compiling a final plan to help an AI assistant solve a user query.",
+    f"\nTo do this, you are provided with {n_plans} plan options, each of which is accompanied by a quality evaluation in the form of feedback." if n_plans > 1 else "\nTo do this, you are provided with an original plan, accompanied by a quality evaluation in the form of feedback.",
+    "\nAdditionally, you are provided with a list of available tools specifications and the previous conversation between user and assistant which should be used to inform the final plan.",
+    "\nYou should consider the feedback provided for each plan and either select or compile a final plan that is most likely to resolve the user's query." if n_plans > 1 else "\nYou should consider the feedback provided for the original plan and compile a final plan that is most likely to resolve the user's query.",
+    "\nThe final plan should be clear and concise, with each step using exactly one tool that is necessary for solving the user query without any additional input required the user. Each step must also have a variable #V indicating the tool (from only those available) to be used and the parameters to be passed to the tool.",
+    "\nDo not make use of tools that are not available. The final plan should only include interaction with the user if it is necessary for obtaining tool parameter values. If user is required for the parameters of the first tool to be used, the entire plan should be to ask the user for the necessary information.",
+    "\nRespond with only the final plan.",
+    "\n\nAvailable tools:",
+    "\n{tools_str}"
+    "\n\nPrevious conversation:",
+    "\n{previous_conversation}",
+    "\n\nPlans and feedback:" if n_plans > 1 else "Plan and feedback:",
+    "\n{plans_and_feedback}",
+    "\n\nFinal plan:\n",
+))
+
+
+# Builds the combined evaluation template (plan efficacy, tool availability and
+# parameter correctness in one prompt) for a given number of plans; Evaluator
+# stores the result as `prompt_all`.
+# Placeholders left for str.format: {tools_str}, {previous_conversation},
+# {plan}, and -- only when n_plans > 1 -- {alternative_plans}.
+# NOTE(review): the wording contains typos ("tasked evaluating", "for for
+# comparison", "3) and the", "addition user input"). They are left untouched
+# here because changing the text changes what the model is sent.
+EVAL_PROMPT_BOTH = lambda n_plans: "".join((
+    "You are tasked evaluating a plan to help an AI assistant solve a user query.",
+    "\nPlans may be given in structured step-by-step format, with each step being accompanied by a variable #V indicating the tool to be used and the parameters to be passed to the tool.",
+    "\nYou should provide feedback on: 1) the efficacy of the plan in solving the user query (without any addition user input), 2) the availability of the tools selected, and 3) and the correctness of tool parameters specified.",
+    "\nTo help you, you are provided with a list of available tools (and their specifications)" + (f", {n_plans-1} alternative plans for for comparison," if n_plans > 1 else "") + " and the previous conversation between the user and the AI assistant.",
+    "\nRespond with only the plan feedback.",
+    "\n\nAvailable tools:",
+    # No comma after the next literal: it is implicitly concatenated with the
+    # following one, which yields the same joined text.
+    "\n{tools_str}"
+    "\n\nPrevious conversation:",
+    "\n{previous_conversation}",
+    # The alternatives section is dropped entirely for a single plan.
+    "\n\nAlternative plans:" if n_plans > 1 else "",
+    "\n{alternative_plans}" if n_plans > 1 else "",
+    "\n\nPlan to evaluate:",
+    "\n{plan}"
+    "\n\nFeedback:\n",
+))
+
+
+# Builds the tool-focused evaluation template (tool availability and parameter
+# correctness) for a given number of plans; Evaluator stores the result as
+# `prompt_tools` and uses it in split-feedback mode.
+# Placeholders left for str.format: {tools_str}, {previous_conversation},
+# {plan}, and -- only when n_plans > 1 -- {alternative_plans}.
+# NOTE(review): wording typos ("tasked evaluating", "for for comparison") are
+# left untouched because changing the text changes what the model is sent.
+EVAL_PROMPT_TOOLS = lambda n_plans: "".join((
+    "You are tasked evaluating a plan to help an AI assistant solve a user query.",
+    "\nPlans may be given in structured step-by-step format, with each step being accompanied by a variable #V indicating the tool to be used and the parameters to be passed to the tool.",
+    "\nYou should provide feedback both on the availability of the tools selected and the correctness of parameters specified.",
+    "\nTo help you, you are provided with a list of available tools (and their specifications)" + (f", {n_plans-1} alternative plans for for comparison," if n_plans > 1 else "") + " and the previous conversation between the user and the AI assistant.",
+    "\nSpecifically, you should check that the tools used are available and that the parameters are correctly specified (i.e., both in the correct format and derived directly from the previous conversation without the use of placeholders or requiring any additional user input).",
+    "\nRespond with only the plan feedback.",
+    "\n\nAvailable tools:",
+    # No comma after the next literal: it is implicitly concatenated with the
+    # following one, which yields the same joined text.
+    "\n{tools_str}"
+    "\n\nPrevious conversation:",
+    "\n{previous_conversation}",
+    # The alternatives section is dropped entirely for a single plan.
+    "\n\nAlternative plans:" if n_plans > 1 else "",
+    "\n{alternative_plans}" if n_plans > 1 else "",
+    "\n\nPlan to evaluate:",
+    "\n{plan}"
+    "\n\nFeedback:\n",
+))
+
+
+# Builds the adequacy-focused evaluation template (does the plan solve the user
+# query with necessary, suitable steps) for a given number of plans; Evaluator
+# stores the result as `prompt_adequacy` and uses it in split-feedback mode.
+# Placeholders left for str.format: {tools_str}, {previous_conversation},
+# {plan}, and -- only when n_plans > 1 -- {alternative_plans}.
+# NOTE(review): wording typos ("tasked evaluating", "for for comparison",
+# "required the user") are left untouched because changing the text changes
+# what the model is sent.
+EVAL_PROMPT_ADEQUACY = lambda n_plans: "".join((
+    "You are tasked evaluating a plan to help an AI assistant solve a user query.",
+    "\nPlans may be given in structured step-by-step format, with each step being accompanied by a variable #V indicating the tool to be used and the parameters to be passed to the tool.",
+    "\nYou should provide feedback on the efficacy of the plan in solving the user query.",
+    "\nTo help you, you are provided with a list of available tools (and their specifications)" + (f", {n_plans-1} alternative plans for for comparison," if n_plans > 1 else "") + " and the previous conversation between the user and the AI assistant.",
+    # No comma after the next literal: it is implicitly concatenated with the
+    # "Respond with only..." line, which yields the same joined text.
+    "\nSpecifically, you should check that each step is necessary and makes use of the most suitable tools for solving the user query without any additional input required the user."
+    "\nRespond with only the plan feedback.",
+    "\n\nAvailable tools:",
+    "\n{tools_str}"
+    "\n\nPrevious conversation:",
+    "\n{previous_conversation}",
+    # The alternatives section is dropped entirely for a single plan.
+    "\n\nAlternative plans:" if n_plans > 1 else "",
+    "\n{alternative_plans}" if n_plans > 1 else "",
+    "\n\nPlan to evaluate:",
+    "\n{plan}"
+    "\n\nFeedback:\n",
+))
@@ -0,0 +1,6 @@
+Metadata-Version: 2.1
+Name: deep_planner
+Version: 0.1
+Summary: A package for self-reflective planning with LLMs.
+Author: Tomas Goldsack
+Author-email: [email protected]
@@ -0,0 +1,11 @@
+README.md
+setup.py
+deep_planner/__init__.py
+deep_planner/compiler.py
+deep_planner/deep_planner.py
+deep_planner/evaluator.py
+deep_planner/prompts.py
+deep_planner.egg-info/PKG-INFO
+deep_planner.egg-info/SOURCES.txt
+deep_planner.egg-info/dependency_links.txt
+deep_planner.egg-info/top_level.txt
@@ -0,0 +1 @@
+
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+# Sample - Evaluate - Aggregate (SEA) framework`
	`2`	`+`
	`3`	`+This repository contains the code for our LLM self-reflection framework, SEA (Sample-Evaluate-Aggregate).`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+from .compiler import Compiler`
	`2`	`+from .evaluator import Evaluator`
	`3`	`+from .deep_planner import DeepPlanner`