small fix to loading with cheats

2025-08-20 22:25:50 +02:00 · 2025-03-13 21:31:16 -07:00 · 2025-03-13 21:31:16 -07:00 · d8e933a25d
commit d8e933a25d
parent 7ca26a70b4
4 changed files with 81 additions and 34 deletions
--- a/analyse_results.py
+++ b/analyse_results.py
@@ -155,7 +155,7 @@ def aggregate_results(local_folders):
            success = int(extract_result(folder_path))
            successful += success

-            if "missing" in folder_path:
+            if "missing" in folder_path and not is_base(folder_path):
                missing_successful += success
                missing_total += 1
            if is_base(folder_path):
--- a/evaluation_script.py
+++ b/evaluation_script.py
@@ -71,35 +71,8 @@ def check_task_completion(agents):
        except (FileNotFoundError, json.JSONDecodeError) as e:
            print(f"Error reading memory for agent {agent}: {e}")
            continue
-            
    return False  # Default to failure if no conclusive result found

-def update_results_file(task_id, success_count, total_count, time_taken, experiment_results, results_filename):
-    """Update the results file with current success ratio and time taken."""
-    success_ratio = success_count / total_count
-    
-    with open(results_filename, 'w') as f:  # 'w' mode overwrites the file each time
-        f.write(f"Task ID: {task_id}\n")
-        f.write(f"Experiments completed: {total_count}\n")
-        f.write(f"Successful experiments: {success_count}\n")
-        f.write(f"Success ratio: {success_ratio:.2f}\n")
-        f.write(f"Time taken for last experiment: {time_taken:.2f} seconds\n")
-        
-        # Write individual experiment results
-        for i, result in enumerate(experiment_results, 1):
-            f.write(f"Experiment {i}: {'Success' if result['success'] else 'Failure'}, Time taken: {result['time_taken']:.2f} seconds\n")
-        
-        # Write aggregated metrics
-        total_time = sum(result['time_taken'] for result in experiment_results)
-        f.write(f"\nAggregated metrics:\n")
-        f.write(f"Total experiments: {total_count}\n")
-        f.write(f"Total successful experiments: {success_count}\n")
-        f.write(f"Overall success ratio: {success_ratio:.2f}\n")
-        f.write(f"Total time taken: {total_time:.2f} seconds\n")
-        f.write(f"Average time per experiment: {total_time / total_count:.2f} seconds\n")
-        f.write(f"Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
-
-
 def set_environment_variable_tmux_session(session_name, key, value):
    """Set an environment variable for the current process."""
    subprocess.run(["tmux", "send-keys", "-t", session_name, f"export {key}={value}", "C-m"])
@@ -194,7 +167,9 @@ def launch_server_experiment(task_path,
        models = [model] * 2
        apis = [api] * 2
    else:
-        agent_names = [f"Andy_{session_name}", f"Jill_{session_name}", f"Bob_{session_name}"]
+        agent_names = []
+        for i in range(num_agents):
+            agent_names.append(f"Agent_{i}_{session_name}")
        models = [model] * 3
        apis = [api] * 3
    make_profiles(agent_names, models, apis, template_profile=template_profile, url=url)
@@ -205,6 +180,11 @@ def launch_server_experiment(task_path,
        agent_profiles_str = f"'[\"{agent_profiles[0]}\"]'"
    elif num_agents == 2:
        agent_profiles_str = f"'[\"{agent_profiles[0]}\", \"{agent_profiles[1]}\"]'"
+    else: 
+        agent_profiles_str = "'["
+        for agent in agent_profiles[:-1]:
+            agent_profiles_str += f'\"{agent}\", '
+        agent_profiles_str += f"\"{agent_profiles[-1]}\"]'"
    print(agent_profiles_str)
    launch_world(server_path, session_name="server_" + session_name, agent_names=agent_names)

@@ -218,11 +198,11 @@ def launch_server_experiment(task_path,
        set_environment_variable_tmux_session(session_name, "INSECURE_CODING", "true")

    # you need to add the bots to the world first before you can add them as op
-    cmd = f"node main.js --task_path example_tasks.json --task_id debug_multi_agent_timeout"
+    cmd = f"node main.js --task_path example_tasks.json --task_id debug_{num_agents}_agent_timeout"

    subprocess.run(["tmux", "send-keys", "-t", session_name, cmd, "C-m"])

-    time.sleep(20)
+    time.sleep(40)

    # add the bots as op
    for agent in agent_names:
--- a/example_tasks.json
+++ b/example_tasks.json
@@ -17,7 +17,7 @@
        },
        "type": "debug"
    },
-    "debug_multi_agent_timeout": {
+    "debug_2_agent_timeout": {
        "goal": "Just stand at a place and don't do anything",
        "agent_count": 2,
        "initial_inventory": {
@@ -29,7 +29,67 @@
            }
        },
        "type": "debug",
-        "timeout": 30
+        "timeout": 60
+    },
+    "debug_3_agent_timeout": {
+        "goal": "Just stand at a place and don't do anything",
+        "agent_count": 3,
+        "initial_inventory": {
+            "0": {
+                "iron_ingot": 1
+            },
+            "1": {
+                "iron_ingot": 1
+            },
+            "2": {
+                "iron_ingot": 1
+            }
+        },
+        "type": "debug",
+        "timeout": 60
+    }, 
+    "debug_4_agent_timeout": {
+        "goal": "Just stand at a place and don't do anything",
+        "agent_count": 4,
+        "initial_inventory": {
+            "0": {
+                "iron_ingot": 1
+            },
+            "1": {
+                "iron_ingot": 1
+            },
+            "2": {
+                "iron_ingot": 1
+            },
+            "3": {
+                "iron_ingot": 1
+            }
+        },
+        "type": "debug",
+        "timeout": 60
+    },
+    "debug_5_agent_timeout": {
+        "goal": "Just stand at a place and don't do anything",
+        "agent_count": 5,
+        "initial_inventory": {
+            "0": {
+                "iron_ingot": 1
+            },
+            "1": {
+                "iron_ingot": 1
+            },
+            "2": {
+                "iron_ingot": 1
+            },
+            "3": {
+                "iron_ingot": 1
+            },
+            "4": {
+                "iron_ingot": 1
+            }
+        },
+        "type": "debug",
+        "timeout": 60
    },
    "debug_different_goal": {
        "goal": {
--- a/profiles/tasks/collab_profile.json
+++ b/profiles/tasks/collab_profile.json
@@ -34,7 +34,14 @@
            {"role": "assistant", "content": "Seems like we need are missing 1 peony which we can use craft 1 pink_dye which is an intermediate item, but since you already have a pink_dye, there is no need to search for peony which is used to craft pink_dye, can you pass the pink_dye to me?"},
            {"role": "user", "content": "terrance: (FROM OTHER BOT) Sure, here you go !givePlayer(\"pink_dye\", 1, \"jacob\")"},
            {"role": "assistant", "content": "Thanks for the pink_dye, now I can craft the pink_wool. !craftItem(\"pink_wool\", 1)"}
+        ], 
+        [
+            {"role": "user", "content": "terrance: (FROM OTHER BOT) Lets craft a cyan bed together"},
+            {"role": "assistant", "content": "Sure, I have !inventory\n I have a cyan_dye and oak_planks. What do you have? "},
+            {"role": "user", "content": "terrance: (FROM OTHER BOT) I have 1 cyan_wool and 1 oak_planks. Let me give those to you !givePlayer(\"cyan_wool\", 1, \"andy\") !givePlayer(\"oak_planks\", 1, \"andy\")"},
+            {"role": "assistant", "content": "Hmm seems like we don't have enough to craft a cyan bed, let me reach out to the third agent and ask for help! !startConversation(\"jacob\", \"Hey jacob, can you help us craft a cyan bed? We need cyan_dye and oak_planks\")"},
+            {"role": "user", "content": "jacob: (FROM OTHER BOT) Sure! Let me give those to you !givePlayer(\"cyan_dye\", 1, \"andy\") !givePlayer(\"oak_planks\", 1, \"andy\")"},
+            {"role": "assistant", "content": "Thanks for the items, now I can craft the cyan bed. !craftItem(\"cyan_bed\", 1)"}
        ]
    ]
-
 }