diff --git a/analyse_results.py b/analyse_results.py index 820800e..552865e 100644 --- a/analyse_results.py +++ b/analyse_results.py @@ -155,7 +155,7 @@ def aggregate_results(local_folders): success = int(extract_result(folder_path)) successful += success - if "missing" in folder_path: + if "missing" in folder_path and not is_base(folder_path): missing_successful += success missing_total += 1 if is_base(folder_path): diff --git a/evaluation_script.py b/evaluation_script.py index 2cf6152..7168b51 100644 --- a/evaluation_script.py +++ b/evaluation_script.py @@ -71,35 +71,8 @@ def check_task_completion(agents): except (FileNotFoundError, json.JSONDecodeError) as e: print(f"Error reading memory for agent {agent}: {e}") continue - return False # Default to failure if no conclusive result found -def update_results_file(task_id, success_count, total_count, time_taken, experiment_results, results_filename): - """Update the results file with current success ratio and time taken.""" - success_ratio = success_count / total_count - - with open(results_filename, 'w') as f: # 'w' mode overwrites the file each time - f.write(f"Task ID: {task_id}\n") - f.write(f"Experiments completed: {total_count}\n") - f.write(f"Successful experiments: {success_count}\n") - f.write(f"Success ratio: {success_ratio:.2f}\n") - f.write(f"Time taken for last experiment: {time_taken:.2f} seconds\n") - - # Write individual experiment results - for i, result in enumerate(experiment_results, 1): - f.write(f"Experiment {i}: {'Success' if result['success'] else 'Failure'}, Time taken: {result['time_taken']:.2f} seconds\n") - - # Write aggregated metrics - total_time = sum(result['time_taken'] for result in experiment_results) - f.write(f"\nAggregated metrics:\n") - f.write(f"Total experiments: {total_count}\n") - f.write(f"Total successful experiments: {success_count}\n") - f.write(f"Overall success ratio: {success_ratio:.2f}\n") - f.write(f"Total time taken: {total_time:.2f} seconds\n") - 
f.write(f"Average time per experiment: {total_time / total_count:.2f} seconds\n") - f.write(f"Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n") - - def set_environment_variable_tmux_session(session_name, key, value): """Set an environment variable for the current process.""" subprocess.run(["tmux", "send-keys", "-t", session_name, f"export {key}={value}", "C-m"]) @@ -194,7 +167,9 @@ def launch_server_experiment(task_path, models = [model] * 2 apis = [api] * 2 else: - agent_names = [f"Andy_{session_name}", f"Jill_{session_name}", f"Bob_{session_name}"] + agent_names = [] + for i in range(num_agents): + agent_names.append(f"Agent_{i}_{session_name}") models = [model] * 3 apis = [api] * 3 make_profiles(agent_names, models, apis, template_profile=template_profile, url=url) @@ -205,6 +180,11 @@ def launch_server_experiment(task_path, agent_profiles_str = f"'[\"{agent_profiles[0]}\"]'" elif num_agents == 2: agent_profiles_str = f"'[\"{agent_profiles[0]}\", \"{agent_profiles[1]}\"]'" + else: + agent_profiles_str = "'[" + for agent in agent_profiles[:-1]: + agent_profiles_str += f'\"{agent}\", ' + agent_profiles_str += f"\"{agent_profiles[-1]}\"]'" print(agent_profiles_str) launch_world(server_path, session_name="server_" + session_name, agent_names=agent_names) @@ -218,11 +198,11 @@ def launch_server_experiment(task_path, set_environment_variable_tmux_session(session_name, "INSECURE_CODING", "true") # you need to add the bots to the world first before you can add them as op - cmd = f"node main.js --task_path example_tasks.json --task_id debug_multi_agent_timeout" + cmd = f"node main.js --task_path example_tasks.json --task_id debug_{num_agents}_agent_timeout" subprocess.run(["tmux", "send-keys", "-t", session_name, cmd, "C-m"]) - time.sleep(20) + time.sleep(40) # add the bots as op for agent in agent_names: diff --git a/example_tasks.json b/example_tasks.json index 4778439..d966103 100644 --- a/example_tasks.json +++ b/example_tasks.json @@ -17,7 +17,7 @@ 
}, "type": "debug" }, - "debug_multi_agent_timeout": { + "debug_2_agent_timeout": { "goal": "Just stand at a place and don't do anything", "agent_count": 2, "initial_inventory": { @@ -29,7 +29,67 @@ } }, "type": "debug", - "timeout": 30 + "timeout": 60 + }, + "debug_3_agent_timeout": { + "goal": "Just stand at a place and don't do anything", + "agent_count": 3, + "initial_inventory": { + "0": { + "iron_ingot": 1 + }, + "1": { + "iron_ingot": 1 + }, + "2": { + "iron_ingot": 1 + } + }, + "type": "debug", + "timeout": 60 + }, + "debug_4_agent_timeout": { + "goal": "Just stand at a place and don't do anything", + "agent_count": 4, + "initial_inventory": { + "0": { + "iron_ingot": 1 + }, + "1": { + "iron_ingot": 1 + }, + "2": { + "iron_ingot": 1 + }, + "3": { + "iron_ingot": 1 + } + }, + "type": "debug", + "timeout": 60 + }, + "debug_5_agent_timeout": { + "goal": "Just stand at a place and don't do anything", + "agent_count": 5, + "initial_inventory": { + "0": { + "iron_ingot": 1 + }, + "1": { + "iron_ingot": 1 + }, + "2": { + "iron_ingot": 1 + }, + "3": { + "iron_ingot": 1 + }, + "4": { + "iron_ingot": 1 + } + }, + "type": "debug", + "timeout": 60 }, "debug_different_goal": { "goal": { diff --git a/profiles/tasks/collab_profile.json b/profiles/tasks/collab_profile.json index df422ac..f0cf090 100644 --- a/profiles/tasks/collab_profile.json +++ b/profiles/tasks/collab_profile.json @@ -34,7 +34,14 @@ {"role": "assistant", "content": "Seems like we need are missing 1 peony which we can use craft 1 pink_dye which is an intermediate item, but since you already have a pink_dye, there is no need to search for peony which is used to craft pink_dye, can you pass the pink_dye to me?"}, {"role": "user", "content": "terrance: (FROM OTHER BOT) Sure, here you go !givePlayer(\"pink_dye\", 1, \"jacob\")"}, {"role": "assistant", "content": "Thanks for the pink_dye, now I can craft the pink_wool. 
!craftItem(\"pink_wool\", 1)"} + ], + [ + {"role": "user", "content": "terrance: (FROM OTHER BOT) Lets craft a cyan bed together"}, + {"role": "assistant", "content": "Sure, I have !inventory\n I have a cyan_dye and oak_planks. What do you have? "}, + {"role": "user", "content": "terrance: (FROM OTHER BOT) I have 1 cyan_wool and 1 oak_planks. Let me give those to you !givePlayer(\"cyan_wool\", 1, \"andy\") !givePlayer(\"oak_planks\", 1, \"andy\")"}, + {"role": "assistant", "content": "Hmm seems like we don't have enough to craft a cyan bed, let me reach out to the third agent and ask for help! !startConversation(\"jacob\", \"Hey jacob, can you help us craft a cyan bed? We need cyan_dye and oak_planks\")"}, + {"role": "user", "content": "jacob: (FROM OTHER BOT) Sure! Let me give those to you !givePlayer(\"cyan_dye\", 1, \"andy\") !givePlayer(\"oak_planks\", 1, \"andy\")"}, + {"role": "assistant", "content": "Thanks for the items, now I can craft the cyan bed. !craftItem(\"cyan_bed\", 1)"} ] ] - } \ No newline at end of file