# Mirror of https://github.com/kolbytn/mindcraft.git
# (synced 2025-04-29 19:44:53 +02:00; 222 lines, 8.4 KiB, Python)
import json
import os
import re
import sys
from collections import defaultdict
|
||
|
|
||
|
def extract_cooking_items(exp_dir):
    """Extract cooking item names from an experiment directory name.

    The ``multiagent_cooking_`` prefix and a trailing
    ``_blocked_access_<digits/underscores>`` suffix are stripped; every
    ``<count>_<name>`` group in what remains contributes one item name.

    Args:
        exp_dir: Directory name (not a full path), e.g.
            ``multiagent_cooking_1_cake_1_bread_blocked_access_0_1``.

    Returns:
        List of item names in order of appearance. The numeric counts are
        discarded and repeated names are kept.
    """
    clean_name = re.sub(r'^multiagent_cooking_', '', exp_dir)
    clean_name = re.sub(r'_blocked_access_[0-9_]+$', '', clean_name)

    # The name pattern is greedy over underscores, so when another
    # "<count>_<name>" group follows, the separating underscore is captured
    # as part of the name; rstrip removes it again.
    return [
        item_match.group(1).rstrip('_')
        for item_match in re.finditer(r'[0-9]+_([a-zA-Z_]+)', clean_name)
    ]
|
||
|
|
||
|
def _count_blocked_agents(exp_dir):
    """Return how many agent ids the ``blocked_access_`` suffix lists (0 if absent)."""
    suffix_match = re.search(r'blocked_access_([0-9_]+)$', exp_dir)
    if suffix_match is None:
        return 0
    return len(suffix_match.group(1).split('_'))


def _experiment_succeeded(exp_path):
    """Return True if any agent JSON log directly inside ``exp_path`` reports success.

    A log reports success when one of its ``turns`` has role ``system`` and a
    string ``content`` containing ``Task ended with score : 1``. Unreadable or
    unexpectedly shaped files are reported/skipped instead of aborting the run.
    """
    for file_name in os.listdir(exp_path):
        if not file_name.endswith(".json"):
            continue
        agent_file_path = os.path.join(exp_path, file_name)

        try:
            # Decode as UTF-8 explicitly rather than with the platform default.
            with open(agent_file_path, 'r', encoding='utf-8') as f:
                agent_data = json.load(f)
        except (ValueError, OSError) as e:
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error reading {agent_file_path}: {e}")
            continue

        turns = agent_data.get("turns") if isinstance(agent_data, dict) else None
        if not isinstance(turns, list):
            continue

        for turn in turns:
            if not isinstance(turn, dict) or turn.get("role") != "system":
                continue
            content = turn.get("content")
            if isinstance(content, str) and "Task ended with score : 1" in content:
                # One successful log is enough; skip the remaining files.
                return True

    return False


def analyze_experiments(root_dir):
    """Aggregate success statistics over the cooking experiments in ``root_dir``.

    Every sub-directory whose name starts with ``multiagent_cooking_`` is
    treated as one experiment. Its cooking items and number of blocked agents
    are parsed from the directory name, and its outcome is read from the JSON
    logs inside it.

    Args:
        root_dir: Directory containing the experiment sub-directories.

    Returns:
        A 3-tuple ``(blocked_access_results, cooking_item_results,
        all_cooking_items)``:

        * ``blocked_access_results``: defaultdict keyed by ``"<n> agent(s)"``
          with counters ``cake_success``/``cake_total`` for experiments that
          include the item ``cake``, ``non_cake_success``/``non_cake_total``
          for the others, and ``success``/``total`` which count the non-cake
          experiments only.
        * ``cooking_item_results``: defaultdict keyed by item name with
          ``success``/``total`` counters (one increment per occurrence of the
          item in the directory name).
        * ``all_cooking_items``: set of every item name encountered.

    Raises:
        OSError: If ``root_dir`` or an experiment directory cannot be listed.
    """
    blocked_access_results = defaultdict(lambda: {
        "success": 0,
        "total": 0,
        "cake_success": 0,
        "cake_total": 0,
        "non_cake_success": 0,
        "non_cake_total": 0,
    })
    cooking_item_results = defaultdict(lambda: {
        "success": 0,
        "total": 0,
    })
    all_cooking_items = set()

    for exp_dir in os.listdir(root_dir):
        full_exp_path = os.path.join(root_dir, exp_dir)
        if not (exp_dir.startswith("multiagent_cooking_") and os.path.isdir(full_exp_path)):
            continue

        cooking_items = extract_cooking_items(exp_dir)
        all_cooking_items.update(cooking_items)
        has_cake = "cake" in cooking_items

        blocked_key = f"{_count_blocked_agents(exp_dir)} agent(s)"
        is_successful = _experiment_succeeded(full_exp_path)

        # Per-item counters.
        for item in cooking_items:
            item_stats = cooking_item_results[item]
            item_stats["total"] += 1
            if is_successful:
                item_stats["success"] += 1

        # Per-blocked-agent-count counters, split by cake / non-cake.
        group_stats = blocked_access_results[blocked_key]
        if has_cake:
            group_stats["cake_total"] += 1
            if is_successful:
                group_stats["cake_success"] += 1
        else:
            group_stats["non_cake_total"] += 1
            if is_successful:
                group_stats["non_cake_success"] += 1

            # Only non-cake experiments count towards the main totals.
            group_stats["total"] += 1
            if is_successful:
                group_stats["success"] += 1

    return blocked_access_results, cooking_item_results, all_cooking_items
|
||
|
|
||
|
def print_blocked_results(results):
    """Print success tables grouped by the number of agents with blocked access.

    Two tables are written to stdout: the main table (``success``/``total``
    plus the cake and non-cake experiment counts, with an overall row) and a
    cake-only table (``cake_success``/``cake_total``, with an overall row).
    A warning line is printed for any group whose ``non_cake_total`` differs
    from its ``total``.

    Args:
        results: Mapping from keys of the form ``"<n> agent(s)"`` to dicts
            with the integer counters ``success``, ``total``,
            ``cake_success``, ``cake_total`` and ``non_cake_total``.

    Raises:
        ValueError: If a key does not start with an integer.
    """

    def rate_cell(successes, attempts):
        # Percentage as text; a group with no attempts is shown as 0.00%.
        rate = (successes / attempts * 100) if attempts > 0 else 0
        return f"{rate:>6.2f}%"

    def main_row(label, successes, attempts, cake_count, non_cake_count):
        # Pad each cell as a whole so columns stay aligned with the header
        # regardless of how many digits the counters have.
        ratio_cell = f"{successes}/{attempts}"
        return (
            f"{label:<15} | {rate_cell(successes, attempts):<15} | {ratio_cell:<15} | "
            f"{cake_count:<15} | {non_cake_count:<15}"
        )

    def cake_row(label, successes, attempts):
        return f"{label:<15} | {rate_cell(successes, attempts):<15} | {successes}/{attempts}"

    print(
        "\nExperiment Results by Number of Agents with Blocked Access "
        "(Excluding Cake Experiments):"
    )
    print("=" * 80)
    print(
        f"{'Blocked Agents':<15} | {'Success Rate':<15} | {'Success/Total':<15} | "
        f"{'Cake Tasks':<15} | {'Non-Cake Tasks':<15}"
    )
    print("-" * 80)

    # Keys look like "<n> agent(s)"; order the rows by the numeric prefix.
    ordered_keys = sorted(results.keys(), key=lambda x: int(x.split()[0]))

    total_success = 0
    total_experiments = 0
    total_cake = 0
    total_non_cake = 0

    for key in ordered_keys:
        success = results[key]["success"]
        total = results[key]["total"]
        cake_total = results[key]["cake_total"]
        non_cake_total = results[key]["non_cake_total"]

        # The main totals are expected to count exactly the non-cake runs.
        if non_cake_total != total:
            print(f"Warning: Non-cake total ({non_cake_total}) doesn't match the total ({total}) for {key}")

        total_success += success
        total_experiments += total
        total_cake += cake_total
        total_non_cake += non_cake_total

        print(main_row(key, success, total, cake_total, non_cake_total))

    print("-" * 80)
    print(main_row("Overall", total_success, total_experiments, total_cake, total_non_cake))

    print("\nCake Experiment Details:")
    print("=" * 60)
    print(f"{'Blocked Agents':<15} | {'Success Rate':<15} | {'Success/Total':<15}")
    print("-" * 60)

    cake_total_success = 0
    cake_total_experiments = 0

    for key in ordered_keys:
        cake_success = results[key]["cake_success"]
        cake_total = results[key]["cake_total"]

        cake_total_success += cake_success
        cake_total_experiments += cake_total

        print(cake_row(key, cake_success, cake_total))

    print("-" * 60)
    print(cake_row("Overall", cake_total_success, cake_total_experiments))
|
||
|
|
||
|
def print_cooking_items(cooking_items):
    """Print the unique cooking items, sorted and comma-separated, with their count.

    Args:
        cooking_items: Sized collection of item-name strings.
    """
    print("\nUnique Cooking Items Found:")
    print("=" * 60)

    names_in_order = list(cooking_items)
    names_in_order.sort()
    joined_names = ", ".join(names_in_order)
    print(joined_names)

    item_count = len(cooking_items)
    print(f"Total unique items: {item_count}")
|
||
|
|
||
|
def print_item_results(item_results):
    """Print a per-item success table to stdout, one row per item sorted by name.

    Args:
        item_results: Mapping from item name to a dict with the integer
            counters ``success`` and ``total``.
    """
    print("\nExperiment Results by Cooking Item:")
    print("=" * 60)
    print(f"{'Cooking Item':<20} | {'Success Rate':<15} | {'Success/Total':<15}")
    print("-" * 60)

    for item in sorted(item_results.keys()):
        success = item_results[item]["success"]
        total = item_results[item]["total"]
        # An item with no recorded experiments is shown as 0.00%.
        success_rate = (success / total * 100) if total > 0 else 0

        # Pad the rate as a whole cell so the last column lines up with the
        # 15-character "Success Rate" header.
        rate_cell = f"{success_rate:>6.2f}%"
        print(f"{item:<20} | {rate_cell:<15} | {success}/{total}")

    print("-" * 60)
|
||
|
|
||
|
def main(experiments_root=None):
    """Analyze an experiments directory and print every report.

    Args:
        experiments_root: Directory holding the ``multiagent_cooking_*``
            experiment folders. When omitted, the first command-line
            argument is used if one was given, otherwise the default
            ``../results/llama_70b_hells_kitchen_cooking_tasks``.

    Raises:
        SystemExit: If the resolved path is not an existing directory.
    """
    if experiments_root is None:
        if len(sys.argv) > 1:
            experiments_root = sys.argv[1]
        else:
            experiments_root = "../results/llama_70b_hells_kitchen_cooking_tasks"

    absolute_root = os.path.abspath(experiments_root)
    # Fail with a readable message instead of a traceback from os.listdir;
    # the default is a relative path, so a wrong working directory is likely.
    if not os.path.isdir(experiments_root):
        raise SystemExit(f"Experiments directory not found: {absolute_root}")

    print(f"Analyzing experiments in: {absolute_root}")
    blocked_results, item_results, unique_items = analyze_experiments(experiments_root)

    print_blocked_results(blocked_results)
    print_cooking_items(unique_items)
    print_item_results(item_results)
|
||
|
|
||
|
# Script entry point: run the analysis only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|