Skip to content

Commit 2d40d9b

Browse files
committed
initial commit
0 parents  commit 2d40d9b

File tree

303 files changed

+61297
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

303 files changed

+61297
-0
lines changed

README.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# On the Efficacy of Self-reflection for Improving LLM Agent Planning
2+
3+
This repository contains the code for our paper: "On the Efficacy of Self-reflection for Improving LLM Agent Planning".
4+
5+
## Navigation
6+
The root of the repository contains the following sub-repositories:
7+
- `SEA/`: Contains the code for our self-reflection framework, SEA (Sample-Evaluate-Aggregate).
8+
- `ToolTalk/`: Contains our version of the [ToolTalk benchmark repository](https://github.com/microsoft/ToolTalk).
9+
- `ToolSandbox/`: Contains our version of the [ToolSandbox benchmark repository](https://github.com/apple/ToolSandbox).
10+
11+
12+
## Details
13+
14+
Both `ToolTalk` and `ToolSandbox` contain the original benchmark code with the necessary adaptations to run our experiments.
15+
Both sub-repositories contain the necessary code to run the experiments, with instructions provided at the top of the respective READMEs. Note that different environment settings will likely be required to run each benchmark.
16+
Additionally, the respective experimental results for each benchmark can be found in the `results` folder in the root of each sub-repository.
17+
18+
19+
## Citation
20+
```
21+
```

SEA/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Sample-Evaluate-Aggregate (SEA) framework
2+
3+
This repository contains the code for our LLM self-reflection framework, SEA (Sample-Evaluate-Aggregate).
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from .compiler import Compiler
2+
from .evaluator import Evaluator
3+
from .deep_planner import DeepPlanner
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
2+
from .prompts import COMPILER_PROMPT
3+
4+
5+
class Compiler:
    """Merge candidate plans and their feedback into one final plan.

    The prompt template is built once from ``COMPILER_PROMPT`` for the given
    number of plans and filled in on every ``compile`` call.
    """

    def __init__(self, n_plans: int, generate_func: callable):
        """Store the generation callable and build the prompt template.

        Args:
            n_plans: Number of candidate plans; forwarded to ``COMPILER_PROMPT``.
            generate_func: Callable invoked as ``generate_func(prompt, temperature)``.
        """
        self.generate = generate_func
        self.n_plans = n_plans
        self.prompt = COMPILER_PROMPT(n_plans=n_plans)

    def compile(self, plan_feedback: dict, previous_convervation_str: str, tools_string: str) -> str:
        """Fill the compiler prompt and return the generator's output.

        Args:
            plan_feedback: Mapping of plan text to the feedback for that plan.
            previous_convervation_str: Text substituted for ``{previous_conversation}``.
            tools_string: Text substituted for ``{tools_str}``.

        Returns:
            Whatever ``self.generate`` returns for the filled prompt at temperature 0.
        """
        # One "<plan>\nFeedback: <feedback>" section per candidate, in dict order.
        sections = []
        for candidate, critique in plan_feedback.items():
            sections.append(f"{candidate}\nFeedback: {critique}")

        filled_prompt = self.prompt.format(
            tools_str=tools_string,
            plans_and_feedback="\n\n".join(sections),
            previous_conversation=previous_convervation_str,
        )
        return self.generate(filled_prompt, 0)
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
import json
2+
from deep_planner import Evaluator, Compiler
3+
4+
from typing import List, Optional
5+
6+
class DeepPlanner():
7+
def __init__(
8+
self,
9+
n_plans: int,
10+
temperature: float | List[float],
11+
plan_generator_func: callable,
12+
eval_generate_func: callable,
13+
compile_generate_func: callable,
14+
extract_plan_func: Optional[callable],
15+
is_split_feedback: Optional[bool] = True,
16+
is_printing: Optional[bool] = False
17+
):
18+
self.n_plans = n_plans
19+
self.temperature = temperature
20+
self.generate_plan = plan_generator_func
21+
self.extract_plan = extract_plan_func
22+
self.evaluator = Evaluator(n_plans, eval_generate_func, is_split_feedback)
23+
self.compiler = Compiler(n_plans, compile_generate_func)
24+
self.is_printing = is_printing
25+
26+
if isinstance(temperature, list):
27+
assert len(temperature) == n_plans, "Number of temperatures must match number of plans."
28+
29+
30+
def print(self, *args, **kwargs):
31+
if self.is_printing:
32+
print(*args, **kwargs)
33+
34+
def sample_plans(self, prompt: Optional[str], messages: Optional[List[dict]]) -> List[str]:
35+
plan_options = []
36+
37+
# Sample multiple unique plans
38+
temperatures = self.temperature if isinstance(self.temperature, list) else [self.temperature for _ in range(self.n_plans)]
39+
40+
while len(plan_options) < self.n_plans:
41+
plan_temp = temperatures[len(plan_options)]
42+
43+
if messages:
44+
self.print(f"Generating plans at temperature={self.temperature} from message...")
45+
plan = self.generate_plan(messages=messages, temperature=plan_temp)
46+
else:
47+
self.print(f"Generating plans at temperature={self.temperature} from prompt...")
48+
plan = self.generate_plan(prompt=prompt, temperature=plan_temp)
49+
50+
if plan.strip() != "":
51+
if self.extract_plan:
52+
plan = self.extract_plan(plan)
53+
54+
if plan not in plan_options:
55+
plan_options.append(plan)
56+
57+
return plan_options
58+
59+
60+
def plan(
61+
self,
62+
previous_conversation_str: str,
63+
tools_string: str,
64+
planning_prompt: Optional[str] = None,
65+
planning_messages: Optional[List[dict]] = None
66+
) -> str:
67+
assert planning_prompt is not None or planning_messages is not None, "Either planning_prompt or planning_messages must be provided."
68+
assert planning_prompt is None or planning_messages is None, "Only one of planning_prompt or planning_messages should be provided."
69+
70+
plans = self.sample_plans(prompt=planning_prompt, messages=planning_messages)
71+
72+
self.print("Plans: ", plans)
73+
if len(plans) < 1:
74+
raise ValueError("No plans generated.")
75+
76+
self.print("Evaluating plans...")
77+
feedback = self.evaluator.evaluate(plans, previous_conversation_str, tools_string)
78+
self.print("Plans and feedback: ", json.dumps(feedback))
79+
80+
self.print("Compiling final plan...")
81+
final_plan = self.extract_plan(self.compiler.compile(feedback, previous_conversation_str, tools_string))
82+
self.print("Compiled plan: ", final_plan)
83+
84+
return final_plan
85+
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
from typing import List
2+
from .prompts import EVAL_PROMPT_TOOLS, EVAL_PROMPT_ADEQUACY, EVAL_PROMPT_BOTH
3+
4+
class Evaluator():
    """Collect generated feedback for each candidate plan.

    Feedback is requested either with one combined prompt or, when
    ``is_split_feedback`` is true, with separate tool-usage and adequacy
    prompts whose outputs are joined by a newline.
    """

    def __init__(self,
                 n_plans: int,
                 generate_func: callable,
                 is_split_feedback: bool = True):
        """Store the generation callable and build the prompt templates.

        Args:
            n_plans: Number of candidate plans; forwarded to the prompt builders.
            generate_func: Callable invoked as ``generate_func(prompt, temperature)``.
            is_split_feedback: Use the two specialised prompts instead of the
                combined one.
        """
        self.generate = generate_func
        self.is_split_feedback = is_split_feedback
        self.prompt_all = EVAL_PROMPT_BOTH(n_plans)
        self.prompt_tools = EVAL_PROMPT_TOOLS(n_plans)
        self.prompt_adequacy = EVAL_PROMPT_ADEQUACY(n_plans)

    def evaluate(
        self,
        plan_options: List[str],
        previous_convervation_str: str,
        tools_string: str
    ) -> dict:
        """Generate feedback for every plan, showing the others as alternatives.

        Args:
            plan_options: Candidate plan texts.
            previous_convervation_str: Text substituted for ``{previous_conversation}``.
            tools_string: Text substituted for ``{tools_str}``.

        Returns:
            Dict mapping each labelled plan (``"Plan <i>: <stripped plan>"``)
            to its feedback.
        """
        feedback = {}
        labelled_plans = [f"Plan {i+1}: " + plan.strip() for i, plan in enumerate(plan_options)]
        for i, plan in enumerate(labelled_plans):
            other_plans = "\n\n".join(labelled_plans[:i] + labelled_plans[i+1:])
            fields = {
                "tools_str": tools_string,
                "previous_conversation": previous_convervation_str,
                "alternative_plans": other_plans,
                "plan": plan,
            }

            if not self.is_split_feedback:
                # The combined template is stored as ``prompt_all`` and the tool
                # text is an argument; ``self.prompt`` / ``self.tools_string``
                # never existed and raised AttributeError.
                feedback[plan] = self.generate(self.prompt_all.format(**fields), 0.0)
            else:
                prompt_tools = self.prompt_tools.format(**fields)
                prompt_adequacy = self.prompt_adequacy.format(**fields)
                feedback[plan] = f"{self.generate(prompt_tools, 0.0)}\n{self.generate(prompt_adequacy, 0.0)}"

        return feedback
52+

SEA/build/lib/deep_planner/prompts.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
def COMPILER_PROMPT(n_plans):
    """Build the compiler prompt template for ``n_plans`` candidate plans.

    The wording switches between a multi-plan variant (``n_plans > 1``) and a
    single-plan variant. The returned string still contains the
    ``{tools_str}``, ``{previous_conversation}`` and ``{plans_and_feedback}``
    placeholders, to be filled with ``str.format``.

    The upper-case name is kept because callers import it under this name.
    """
    multiple_plans = n_plans > 1
    return "".join((
        "You are tasked with compiling a final plan to help an AI assistant solve a user query.",
        f"\nTo do this, you are provided with {n_plans} plan options, each of which is accompanied by a quality evaluation in the form of feedback." if multiple_plans else "\nTo do this, you are provided with an original plan, accompanied by a quality evaluation in the form of feedback.",
        "\nAdditionally, you are provided with a list of available tools specifications and the previous conversation between user and assistant which should be used to inform the final plan.",
        "\nYou should consider the feedback provided for each plan and either select or compile a final plan that is most likely to resolve the user's query." if multiple_plans else "\nYou should consider the feedback provided for the original plan and compile a final plan that is most likely to resolve the user's query.",
        "\nThe final plan should be clear and concise, with each step using exactly one tool that is necessary for solving the user query without any additional input required the user. Each step must also have a variable #V indicating the tool (from only those available) to be used and the parameters to be passed to the tool.",
        "\nDo not make use of tools that are not available. The final plan should only include interaction with the user if it is necessary for obtaining tool parameter values. If user is required for the parameters of the first tool to be used, the entire plan should be to ask the user for the necessary information.",
        "\nRespond with only the final plan.",
        "\n\nAvailable tools:",
        "\n{tools_str}",
        "\n\nPrevious conversation:",
        "\n{previous_conversation}",
        # Both variants need the blank-line separator; the single-plan header
        # previously ran straight on from the end of the conversation text.
        "\n\nPlans and feedback:" if multiple_plans else "\n\nPlan and feedback:",
        "\n{plans_and_feedback}",
        "\n\nFinal plan:\n",
    ))
17+
18+
19+
def EVAL_PROMPT_BOTH(n_plans):
    """Build the combined (efficacy + tools + parameters) evaluation template.

    The alternative-plans sentence and section appear only when
    ``n_plans > 1``. The result keeps the ``{tools_str}``,
    ``{previous_conversation}``, ``{plan}`` and (if present)
    ``{alternative_plans}`` placeholders for ``str.format``.
    """
    # NOTE: the prompt wording is runtime text and is kept exactly as authored.
    has_alternatives = n_plans > 1
    if has_alternatives:
        comparison_note = f", {n_plans-1} alternative plans for for comparison,"
    else:
        comparison_note = ""

    pieces = [
        "You are tasked evaluating a plan to help an AI assistant solve a user query.",
        "\nPlans may be given in structured step-by-step format, with each step being accompanied by a variable #V indicating the tool to be used and the parameters to be passed to the tool.",
        "\nYou should provide feedback on: 1) the efficacy of the plan in solving the user query (without any addition user input), 2) the availability of the tools selected, and 3) and the correctness of tool parameters specified.",
        "\nTo help you, you are provided with a list of available tools (and their specifications)",
        comparison_note,
        " and the previous conversation between the user and the AI assistant.",
        "\nRespond with only the plan feedback.",
        "\n\nAvailable tools:",
        "\n{tools_str}",
        "\n\nPrevious conversation:",
        "\n{previous_conversation}",
    ]
    if has_alternatives:
        pieces.append("\n\nAlternative plans:")
        pieces.append("\n{alternative_plans}")
    pieces.extend(("\n\nPlan to evaluate:", "\n{plan}", "\n\nFeedback:\n"))
    return "".join(pieces)
35+
36+
37+
def EVAL_PROMPT_TOOLS(n_plans):
    """Build the tool-availability / parameter-correctness evaluation template.

    The alternative-plans sentence and section appear only when
    ``n_plans > 1``. The result keeps the ``{tools_str}``,
    ``{previous_conversation}``, ``{plan}`` and (if present)
    ``{alternative_plans}`` placeholders for ``str.format``.
    """
    # NOTE: the prompt wording is runtime text and is kept exactly as authored.
    has_alternatives = n_plans > 1
    if has_alternatives:
        comparison_note = f", {n_plans-1} alternative plans for for comparison,"
    else:
        comparison_note = ""

    pieces = [
        "You are tasked evaluating a plan to help an AI assistant solve a user query.",
        "\nPlans may be given in structured step-by-step format, with each step being accompanied by a variable #V indicating the tool to be used and the parameters to be passed to the tool.",
        "\nYou should provide feedback both on the availability of the tools selected and the correctness of parameters specified.",
        "\nTo help you, you are provided with a list of available tools (and their specifications)",
        comparison_note,
        " and the previous conversation between the user and the AI assistant.",
        "\nSpecifically, you should check that the tools used are available and that the parameters are correctly specified (i.e., both in the correct format and derived directly from the previous conversation without the use of placeholders or requiring any additional user input).",
        "\nRespond with only the plan feedback.",
        "\n\nAvailable tools:",
        "\n{tools_str}",
        "\n\nPrevious conversation:",
        "\n{previous_conversation}",
    ]
    if has_alternatives:
        pieces.append("\n\nAlternative plans:")
        pieces.append("\n{alternative_plans}")
    pieces.extend(("\n\nPlan to evaluate:", "\n{plan}", "\n\nFeedback:\n"))
    return "".join(pieces)
54+
55+
56+
def EVAL_PROMPT_ADEQUACY(n_plans):
    """Build the plan-efficacy (adequacy) evaluation template.

    The alternative-plans sentence and section appear only when
    ``n_plans > 1``. The result keeps the ``{tools_str}``,
    ``{previous_conversation}``, ``{plan}`` and (if present)
    ``{alternative_plans}`` placeholders for ``str.format``.
    """
    # NOTE: the prompt wording is runtime text and is kept exactly as authored.
    has_alternatives = n_plans > 1
    if has_alternatives:
        comparison_note = f", {n_plans-1} alternative plans for for comparison,"
    else:
        comparison_note = ""

    pieces = [
        "You are tasked evaluating a plan to help an AI assistant solve a user query.",
        "\nPlans may be given in structured step-by-step format, with each step being accompanied by a variable #V indicating the tool to be used and the parameters to be passed to the tool.",
        "\nYou should provide feedback on the efficacy of the plan in solving the user query.",
        "\nTo help you, you are provided with a list of available tools (and their specifications)",
        comparison_note,
        " and the previous conversation between the user and the AI assistant.",
        "\nSpecifically, you should check that each step is necessary and makes use of the most suitable tools for solving the user query without any additional input required the user.",
        "\nRespond with only the plan feedback.",
        "\n\nAvailable tools:",
        "\n{tools_str}",
        "\n\nPrevious conversation:",
        "\n{previous_conversation}",
    ]
    if has_alternatives:
        pieces.append("\n\nAlternative plans:")
        pieces.append("\n{alternative_plans}")
    pieces.extend(("\n\nPlan to evaluate:", "\n{plan}", "\n\nFeedback:\n"))
    return "".join(pieces)

SEA/deep_planner.egg-info/PKG-INFO

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
Metadata-Version: 2.1
2+
Name: deep_planner
3+
Version: 0.1
4+
Summary: A package for self-reflective planning with LLMs.
5+
Author: Tomas Goldsack
6+
Author-email: [email protected]

SEA/deep_planner.egg-info/SOURCES.txt

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
README.md
2+
setup.py
3+
deep_planner/__init__.py
4+
deep_planner/compiler.py
5+
deep_planner/deep_planner.py
6+
deep_planner/evaluator.py
7+
deep_planner/prompts.py
8+
deep_planner.egg-info/PKG-INFO
9+
deep_planner.egg-info/SOURCES.txt
10+
deep_planner.egg-info/dependency_links.txt
11+
deep_planner.egg-info/top_level.txt
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+

0 commit comments

Comments
 (0)