Updates on construction and cooking task analysis: PrettyTable output, flexibility to enter multiple folders, ...

Ayush Maniar 2025-03-28 13:13:16 -07:00
parent 63e7861c4f
commit d39b254a06
2 changed files with 200 additions and 190 deletions
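The headline change in both scripts is swapping hand-padded string tables for the prettytable package. As orientation (this snippet is not part of the diff), the only slice of the PrettyTable API the new code relies on is setting field_names, adding rows, and printing the table object; the row values here are invented:

    from prettytable import PrettyTable

    # Minimal illustration of the PrettyTable calls used throughout this commit.
    table = PrettyTable()
    table.field_names = ["Model", "Success Rate", "Success/Total"]
    table.add_row(["GPT-4o", "66.67%", "2/3"])
    table.add_row(["Claude 3.5 sonnet", "50.00%", "1/2"])
    print(table)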

Changed file 1 of 2 (construction task success-score analysis):

@@ -9,11 +9,11 @@ def extract_success_scores(folders, model_names):
     all_task_scores = defaultdict(dict)  # Stores task-wise scores per model
     zero_score_tasks = defaultdict(list)  # Stores tasks with 0 score per model
-    null_score_tasks = defaultdict(list)  # Stores tasks with null score per model
     material_groups = defaultdict(lambda: defaultdict(list))
     room_groups = defaultdict(lambda: defaultdict(list))
     material_room_groups = defaultdict(lambda: defaultdict(list))
     overall_scores = defaultdict(list)  # New dict to store all scores for each model
+    skipped_tasks = defaultdict(list)  # Stores tasks with no score message per model

     pattern = re.compile(r"materials_(\d+)_rooms_(\d+)")
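The construction task folders encode their parameters in the directory name, and the regex above recovers the material and room counts so scores can be grouped further down. A small sketch of that parsing; the folder name is hypothetical but follows the materials_*_rooms_* scheme the pattern expects:

    import re

    pattern = re.compile(r"materials_(\d+)_rooms_(\d+)")

    task_folder = "construction_materials_3_rooms_2"  # hypothetical folder name
    match = pattern.search(task_folder)
    if match:
        material = int(match.group(1))  # 3
        room = int(match.group(2))      # 2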
@@ -50,22 +50,22 @@ def extract_success_scores(folders, model_names):
                     print(f"Error reading {file_path}: {e}")

             if logs_found and not score_found:
-                # Score not found but logs exist - mark as null
-                all_task_scores[task_folder][model_name] = None
-                null_score_tasks[model_name].append(task_folder)
+                # Score not found but logs exist - skip this task
+                skipped_tasks[model_name].append(task_folder)
+                print(f"Error: No score message found for task '{task_folder}' with model '{model_name}'. Skipping this task.")

             if not logs_found:
                 print(f"No log files found in {task_folder}")

-    # Calculate model completion rates (ignore null scores)
+    # Calculate model completion rates (only consider tasks with scores)
     model_completion_rates = {}
     for model_name in model_names:
-        valid_tasks = [task for task in all_task_scores.keys() if model_name in all_task_scores[task] and all_task_scores[task][model_name] is not None]
+        valid_tasks = [task for task in all_task_scores.keys() if model_name in all_task_scores[task]]
         total_tasks = len(valid_tasks)
         completed_tasks = len([task for task in valid_tasks if all_task_scores[task][model_name] > 0])
         model_completion_rates[model_name] = (completed_tasks / total_tasks) if total_tasks > 0 else 0

-    # Process task scores into groups (ignore null and 0 scores)
+    # Process task scores into groups (ignore 0 scores)
     for task, model_scores in all_task_scores.items():
         match = pattern.search(task)
         if match:
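With null scores gone, a task only counts toward a model's completion rate if that model produced a score message at all; tasks without one simply have no entry in all_task_scores for that model. A condensed sketch of the calculation, with the dictionary shape mirroring the script and the values invented:

    # task -> {model: score}; a model with no score message has no entry for that task
    all_task_scores = {
        "materials_3_rooms_2": {"GPT-4o": 1.0},
        "materials_5_rooms_4": {"GPT-4o": 0.0},
        "materials_6_rooms_4": {},  # skipped for GPT-4o: no score message found
    }

    model_name = "GPT-4o"
    valid_tasks = [t for t, scores in all_task_scores.items() if model_name in scores]
    completed_tasks = [t for t in valid_tasks if all_task_scores[t][model_name] > 0]
    completion_rate = len(completed_tasks) / len(valid_tasks) if valid_tasks else 0  # 0.5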
@@ -73,7 +73,7 @@ def extract_success_scores(folders, model_names):
             room = int(match.group(2))
             for model, score in model_scores.items():
-                if score is not None and score > 0:  # Ignore null and 0 scores
+                if score > 0:  # Ignore 0 scores
                     material_groups[material][model].append(score)
                     room_groups[room][model].append(score)
                     material_room_groups[(material, room)][model].append(score)
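Each non-zero score is appended to three nested defaultdicts keyed by material count, room count, and the (material, room) pair; averaging those lists later produces the per-group tables. A minimal sketch of that grouping-and-averaging pattern, shown for the material grouping only and with made-up observations:

    from collections import defaultdict

    material_groups = defaultdict(lambda: defaultdict(list))

    observations = [(3, "GPT-4o", 0.8), (3, "GPT-4o", 0.4), (5, "GPT-4o", 1.0)]
    for material, model, score in observations:
        if score > 0:  # zeros are excluded from the per-group averages
            material_groups[material][model].append(score)

    avg_material_scores = {
        material: {model: sum(s) / len(s) for model, s in per_model.items()}
        for material, per_model in material_groups.items()
    }
    # material 3 averages to 0.6 for GPT-4o, material 5 to 1.0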
@@ -102,14 +102,14 @@ def extract_success_scores(folders, model_names):
             for model in model_names:
                 score = all_task_scores[task].get(model)
                 if score is None:
-                    row.append("null")
+                    row.append("-")
                 else:
                     row.append(round(score, 2))
             table.add_row(row)

         print("\nTask-wise Success Scores")
         print(table)

-    def display_zero_and_null_score_tasks():
+    def display_zero_and_skipped_tasks():
         for model in model_names:
             if zero_score_tasks[model]:
                 table = PrettyTable([f"{model} - Tasks with 0 Score"])
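In the task-wise table a model that has no score for a task is now rendered as a plain "-" rather than the string "null". A small sketch of the row-building logic, using the same .get()/None check as the script and toy values:

    from prettytable import PrettyTable

    model_names = ["GPT-4o", "Claude 3.5 sonnet"]
    scores_for_task = {"GPT-4o": 0.75}  # the second model has no entry for this task

    table = PrettyTable(["Task"] + model_names)
    row = ["materials_3_rooms_2"]
    for model in model_names:
        score = scores_for_task.get(model)
        row.append("-" if score is None else round(score, 2))
    table.add_row(row)
    print(table)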
@@ -118,28 +118,28 @@ def extract_success_scores(folders, model_names):
                 print(f"\n{model} - Tasks with 0 Success Score")
                 print(table)

-            if null_score_tasks[model]:
-                table = PrettyTable([f"{model} - Tasks with Null Score"])
-                for task in null_score_tasks[model]:
+            if skipped_tasks[model]:
+                table = PrettyTable([f"{model} - Skipped Tasks (No Score Message)"])
+                for task in skipped_tasks[model]:
                     table.add_row([task])
-                print(f"\n{model} - Tasks with Null Success Score")
+                print(f"\n{model} - Skipped Tasks (No Score Message)")
                 print(table)

     def display_overall_averages():
         table = PrettyTable(["Metric"] + model_names)

-        # Overall average score (including zeros, excluding nulls)
+        # Overall average score (including zeros)
         row_with_zeros = ["Average Score (All Tasks)"]
         for model in model_names:
-            valid_scores = [s for s in overall_scores[model] if s is not None]
+            valid_scores = overall_scores[model]
             avg = sum(valid_scores) / len(valid_scores) if valid_scores else 0
             row_with_zeros.append(round(avg, 2))
         table.add_row(row_with_zeros)

-        # Overall average score (excluding zeros and nulls)
+        # Overall average score (excluding zeros)
         row_without_zeros = ["Average Score (Completed Tasks)"]
         for model in model_names:
-            completed_scores = [s for s in overall_scores[model] if s is not None and s > 0]
+            completed_scores = [s for s in overall_scores[model] if s > 0]
             avg = sum(completed_scores) / len(completed_scores) if completed_scores else 0
             row_without_zeros.append(round(avg, 2))
         table.add_row(row_without_zeros)
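Two headline averages are reported per model: one over every scored task (zeros included) and one over completed tasks only (zeros excluded). With null scores no longer stored, the second average needs nothing more than a > 0 filter. A toy illustration of the difference:

    overall_scores = {"GPT-4o": [1.0, 0.0, 0.5, 0.0]}  # invented scores

    for model, scores in overall_scores.items():
        avg_all = sum(scores) / len(scores) if scores else 0                 # 0.375
        completed = [s for s in scores if s > 0]
        avg_completed = sum(completed) / len(completed) if completed else 0  # 0.75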
@@ -150,24 +150,30 @@ def extract_success_scores(folders, model_names):
             completion_row.append(round(model_completion_rates[model] * 100, 2))
         table.add_row(completion_row)

-        # Total number of tasks (excluding nulls)
+        # Total number of tasks
         task_count_row = ["Total Tasks"]
         for model in model_names:
-            valid_tasks = [task for task in all_task_scores.keys() if model in all_task_scores[task] and all_task_scores[task][model] is not None]
+            valid_tasks = [task for task in all_task_scores.keys() if model in all_task_scores[task]]
            task_count_row.append(len(valid_tasks))
         table.add_row(task_count_row)

+        # Number of skipped tasks
+        skipped_count_row = ["Skipped Tasks"]
+        for model in model_names:
+            skipped_count_row.append(len(skipped_tasks[model]))
+        table.add_row(skipped_count_row)
+
         print("\nOverall Performance Metrics")
         print(table)

     display_overall_averages()  # Display overall averages first
     display_task_scores()
-    display_zero_and_null_score_tasks()
+    display_zero_and_skipped_tasks()
     display_table("Average Success Score by Material", avg_material_scores)
     display_table("Average Success Score by Room", avg_room_scores)
     display_table("Average Success Score by (Material, Room) Tuples", avg_material_room_scores, tuple_keys=True)

 # Example usage
-folders = ["experiments/gpt-4o_construction_tasks", "experiments/exp_03-23_12-31"]
-model_names = ["GPT-4o","Claude 3.5 sonnet"]
+folders = ["experiments/gpt-4o_construction_tasks", "experiments/claude-3-5-sonnet-latest_construction_tasks"]
+model_names = ["GPT-4o", "Claude 3.5 sonnet"]
 extract_success_scores(folders, model_names)

Changed file 2 of 2 (cooking task analysis):

@@ -2,6 +2,7 @@ import os
 import json
 import re
 from collections import defaultdict
+from prettytable import PrettyTable

 def extract_cooking_items(exp_dir):
     """Extract cooking items from experiment directory name."""
@@ -36,8 +37,8 @@ def analyze_experiments(root_dir, model_name):
     # Keep track of all unique cooking items
     all_cooking_items = set()

-    # Track skipped experiments
-    skipped_experiments = []
+    # Keep track of ignored tasks
+    ignored_tasks = []

     # Get a list of all experiment directories
     experiment_dirs = [d for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d))
@@ -78,18 +79,18 @@ def analyze_experiments(root_dir, model_name):
                 with open(agent_file_path, 'r') as f:
                     agent_data = json.load(f)

-                    # Check for success in the turns data
+                    # Check for score information in the turns data
                     if "turns" in agent_data:
                         for turn in agent_data["turns"]:
                             if turn.get("role") == "system" and "content" in turn:
-                                if isinstance(turn["content"], str) and "Task ended with score" in turn["content"]:
+                                if isinstance(turn["content"], str) and "Task ended with score : " in turn["content"]:
                                     score_found = True
                                     if "Task ended with score : 1" in turn["content"]:
                                         is_successful = True
                                     break

-                # If we found score information, no need to check other files
-                if score_found:
+                # If we found success, no need to check other files
+                if is_successful:
                     break

             except (json.JSONDecodeError, IOError) as e:
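Success is detected by scanning each agent's JSON transcript for a system message containing the score string; the stricter "Task ended with score : " match (with the spaced colon) avoids looser partial matches, and the outer loop now stops once a success is seen rather than at the first score of any kind. A self-contained sketch of that scan over a toy transcript:

    # Toy transcript in the same shape as the agent JSON files
    agent_data = {"turns": [
        {"role": "user", "content": "cook the rabbit stew"},
        {"role": "system", "content": "Task ended with score : 1"},
    ]}

    score_found = is_successful = False
    for turn in agent_data.get("turns", []):
        content = turn.get("content")
        if turn.get("role") == "system" and isinstance(content, str):
            if "Task ended with score : " in content:
                score_found = True
                if "Task ended with score : 1" in content:
                    is_successful = True
                break
    # score_found == True, is_successful == True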
@@ -97,10 +98,9 @@ def analyze_experiments(root_dir, model_name):
                 # Continue to check other agent files instead of failing
                 continue

-        # Skip experiments with no score information
+        # If no score information was found in any agent file, ignore this task
         if not score_found:
-            skipped_experiments.append(exp_dir)
-            print(f"Warning: No task score found in experiment {exp_dir} - skipping")
+            ignored_tasks.append(exp_dir)
             continue

         # Update cooking item results
@@ -114,178 +114,195 @@ def analyze_experiments(root_dir, model_name):
             if is_successful:
                 blocked_access_results[blocked_key]["success"] += 1

-    return blocked_access_results, cooking_item_results, all_cooking_items, skipped_experiments
+    # Print information about ignored tasks
+    if ignored_tasks:
+        print(f"\n{model_name}: Ignored {len(ignored_tasks)} tasks with no score information:")
+        for task in ignored_tasks:
+            print(f" - {task}")
+
+    return blocked_access_results, cooking_item_results, all_cooking_items, ignored_tasks


 def print_model_comparison_blocked(models_results):
     print("\nModel Comparison by Number of Agents with Blocked Access:")
     print("=" * 100)

     # Get all possible blocked access keys
     all_blocked_keys = set()
     for model_results in models_results.values():
         all_blocked_keys.update(model_results.keys())

     # Sort the keys
     sorted_keys = sorted(all_blocked_keys, key=lambda x: int(x.split()[0]))

-    # Create the header
-    header = f"{'Blocked Agents':<15} | "
-    for model_name in models_results.keys():
-        header += f"{model_name+' Success Rate':<20} | {model_name+' Success/Total':<20} | "
-    print(header)
-    print("-" * 100)
+    # Create the table
+    table = PrettyTable()
+    table.field_names = ["Blocked Agents"] + [
+        f"{model_name} (Success Rate | Success/Total)" for model_name in models_results.keys()
+    ]

-    # Calculate and print the results for each blocked key
+    # Calculate and add rows for each blocked key
     model_totals = {model: {"success": 0, "total": 0} for model in models_results.keys()}
     for key in sorted_keys:
-        row = f"{key:<15} | "
+        row = [key]
         for model_name, model_results in models_results.items():
             if key in model_results:
                 success = model_results[key]["success"]
                 total = model_results[key]["total"]
                 model_totals[model_name]["success"] += success
                 model_totals[model_name]["total"] += total
                 success_rate = (success / total * 100) if total > 0 else 0
-                row += f"{success_rate:>6.2f}%{'':<12} | {success}/{total}{'':<12} | "
+                row.append(f"{success_rate:.2f}% | {success}/{total}")
             else:
-                row += f"{'N/A':<19} | {'N/A':<19} | "
-        print(row)
+                row.append("N/A")
+        table.add_row(row)
+
+    # Print the table
+    print(table)

     # Print the overall results
-    print("-" * 100)
-    row = f"{'Overall':<15} | "
+    overall_row = ["Overall"]
     for model_name, totals in model_totals.items():
         success = totals["success"]
         total = totals["total"]
         success_rate = (success / total * 100) if total > 0 else 0
-        row += f"{success_rate:>6.2f}%{'':<12} | {success}/{total}{'':<12} | "
-    print(row)
+        overall_row.append(f"{success_rate:.2f}% | {success}/{total}")
+    table.add_row(overall_row)
+    print(table)


 def print_model_comparison_items(models_item_results, all_cooking_items):
     print("\nModel Comparison by Cooking Item:")
     print("=" * 100)

-    # Create the header
-    header = f"{'Cooking Item':<20} | "
-    for model_name in models_item_results.keys():
-        header += f"{model_name+' Success Rate':<20} | {model_name+' Success/Total':<20} | "
-    print(header)
-    print("-" * 100)
+    # Create the table
+    table = PrettyTable()
+    table.field_names = ["Cooking Item"] + [
+        f"{model_name} (Success Rate | Success/Total)" for model_name in models_item_results.keys()
+    ]

-    # Calculate and print the results for each cooking item
+    # Calculate and add rows for each cooking item
     model_totals = {model: {"success": 0, "total": 0} for model in models_item_results.keys()}
     for item in sorted(all_cooking_items):
-        row = f"{item:<20} | "
+        row = [item]
         for model_name, model_results in models_item_results.items():
             if item in model_results:
                 success = model_results[item]["success"]
                 total = model_results[item]["total"]
                 model_totals[model_name]["success"] += success
                 model_totals[model_name]["total"] += total
                 success_rate = (success / total * 100) if total > 0 else 0
-                row += f"{success_rate:>6.2f}%{'':<12} | {success}/{total}{'':<12} | "
+                row.append(f"{success_rate:.2f}% | {success}/{total}")
             else:
-                row += f"{'N/A':<19} | {'N/A':<19} | "
-        print(row)
+                row.append("N/A")
+        table.add_row(row)
+
+    # Print the table
+    print(table)

     # Print the overall results
-    print("-" * 100)
-    row = f"{'Overall':<20} | "
+    overall_row = ["Overall"]
     for model_name, totals in model_totals.items():
         success = totals["success"]
         total = totals["total"]
         success_rate = (success / total * 100) if total > 0 else 0
-        row += f"{success_rate:>6.2f}%{'':<12} | {success}/{total}{'':<12} | "
-    print(row)
+        overall_row.append(f"{success_rate:.2f}% | {success}/{total}")
+    table.add_row(overall_row)
+    print(table)


 def print_model_comparison_items_by_blocked(models_data, all_cooking_items):
     print("\nDetailed Model Comparison by Cooking Item and Blocked Agent Count:")
     print("=" * 120)

     # For each cooking item, create a comparison table by blocked agent count
     for item in sorted(all_cooking_items):
         print(f"\nResults for cooking item: {item}")
         print("-" * 100)

-        # Create the header
-        header = f"{'Blocked Agents':<15} | "
-        for model_name in models_data.keys():
-            header += f"{model_name+' Success Rate':<20} | {model_name+' Success/Total':<20} | "
-        print(header)
-        print("-" * 100)
+        # Create the table
+        table = PrettyTable()
+        table.field_names = ["Blocked Agents"] + [
+            f"{model_name} Success Rate" for model_name in models_data.keys()
+        ] + [
+            f"{model_name} Success/Total" for model_name in models_data.keys()
+        ]

         # Get all possible blocked agent counts
         all_blocked_keys = set()
         for model_name, model_data in models_data.items():
             _, _, item_blocked_data = model_data
             for blocked_key in item_blocked_data.get(item, {}).keys():
                 all_blocked_keys.add(blocked_key)

         # Sort the keys
         sorted_keys = sorted(all_blocked_keys, key=lambda x: int(x.split()[0]))

-        # Print each row
+        # Add rows for each blocked key
         for blocked_key in sorted_keys:
-            row = f"{blocked_key:<15} | "
+            row = [blocked_key]
             for model_name, model_data in models_data.items():
                 _, _, item_blocked_data = model_data
                 if item in item_blocked_data and blocked_key in item_blocked_data[item]:
                     success = item_blocked_data[item][blocked_key]["success"]
                     total = item_blocked_data[item][blocked_key]["total"]
                     if total > 0:
                         success_rate = (success / total * 100)
-                        row += f"{success_rate:>6.2f}%{'':<12} | {success}/{total}{'':<12} | "
+                        row.append(f"{success_rate:.2f}%")
+                        row.append(f"{success}/{total}")
                     else:
-                        row += f"{'N/A':<19} | {'0/0':<19} | "
+                        row.append("N/A")
+                        row.append("0/0")
                 else:
-                    row += f"{'N/A':<19} | {'N/A':<19} | "
-            print(row)
+                    row.append("N/A")
+                    row.append("N/A")
+            table.add_row(row)
+
+        # Print the table
+        print(table)

         # Print item summary for each model
-        print("-" * 100)
-        row = f"{'Overall':<15} | "
+        overall_row = ["Overall"]
         for model_name, model_data in models_data.items():
             _, item_results, _ = model_data
             if item in item_results:
                 success = item_results[item]["success"]
                 total = item_results[item]["total"]
                 if total > 0:
                     success_rate = (success / total * 100)
-                    row += f"{success_rate:>6.2f}%{'':<12} | {success}/{total}{'':<12} | "
+                    overall_row.append(f"{success_rate:.2f}%")
+                    overall_row.append(f"{success}/{total}")
                 else:
-                    row += f"{'N/A':<19} | {'0/0':<19} | "
+                    overall_row.append("N/A")
+                    overall_row.append("0/0")
             else:
-                row += f"{'N/A':<19} | {'N/A':<19} | "
-        print(row)
+                overall_row.append("N/A")
+                overall_row.append("N/A")
+        table.add_row(overall_row)
+        print(table)


 def generate_item_blocked_data(experiments_root):
     # Organize data by item and blocked agent count
     item_blocked_data = defaultdict(lambda: defaultdict(lambda: {"success": 0, "total": 0}))

-    # Track skipped experiments
-    skipped_experiments = []
+    # Keep track of ignored tasks
+    ignored_tasks = []

     # Populate the data structure
     for exp_dir in os.listdir(experiments_root):
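All three comparison printers now follow the same PrettyTable pattern: the first column is the grouping key, each model contributes a combined "rate | success/total" column, and an "Overall" row built from accumulated totals is appended at the end. A condensed, runnable sketch of that pattern with invented counts for a single model:

    from prettytable import PrettyTable

    # blocked-agent bucket -> success/total counts per model (invented numbers)
    models_results = {
        "GPT-4o-2agent": {
            "0 agent(s)": {"success": 3, "total": 4},
            "1 agent(s)": {"success": 1, "total": 4},
        },
    }

    all_keys = set()
    for results in models_results.values():
        all_keys.update(results)
    sorted_keys = sorted(all_keys, key=lambda k: int(k.split()[0]))

    table = PrettyTable()
    table.field_names = ["Blocked Agents"] + [
        f"{m} (Success Rate | Success/Total)" for m in models_results
    ]
    totals = {m: {"success": 0, "total": 0} for m in models_results}
    for key in sorted_keys:
        row = [key]
        for m, results in models_results.items():
            if key in results:
                s, t = results[key]["success"], results[key]["total"]
                totals[m]["success"] += s
                totals[m]["total"] += t
                rate = (s / t * 100) if t > 0 else 0
                row.append(f"{rate:.2f}% | {s}/{t}")
            else:
                row.append("N/A")
        table.add_row(row)

    overall_row = ["Overall"]
    for m, t in totals.items():
        rate = (t["success"] / t["total"] * 100) if t["total"] > 0 else 0
        overall_row.append(f"{rate:.2f}% | {t['success']}/{t['total']}")
    table.add_row(overall_row)
    print(table)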
@@ -304,7 +321,7 @@ def generate_item_blocked_data(experiments_root):
         else:
             blocked_key = "0 agent(s)"

-        # Check if the task was successful
+        # Check if the task was successful and if score information exists
         is_successful = False
         score_found = False
         full_exp_path = os.path.join(experiments_root, exp_dir)
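generate_item_blocked_data accumulates into a two-level defaultdict, cooking item -> blocked-agent bucket -> success/total counters, so no key ever needs explicit initialisation. A minimal sketch of that accumulation; the item names and bucket here are invented:

    from collections import defaultdict

    item_blocked_data = defaultdict(lambda: defaultdict(lambda: {"success": 0, "total": 0}))

    # One successful experiment that involved two items, with one agent blocked.
    cooking_items, blocked_key, is_successful = ["bread", "cooked_beef"], "1 agent(s)", True
    for item in cooking_items:
        item_blocked_data[item][blocked_key]["total"] += 1
        if is_successful:
            item_blocked_data[item][blocked_key]["success"] += 1

    # item_blocked_data["bread"]["1 agent(s)"] == {"success": 1, "total": 1}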
@@ -318,103 +335,90 @@ def generate_item_blocked_data(experiments_root):
                     if "turns" in agent_data:
                         for turn in agent_data["turns"]:
                             if turn.get("role") == "system" and "content" in turn:
-                                if isinstance(turn["content"], str) and "Task ended with score" in turn["content"]:
+                                if isinstance(turn["content"], str) and "Task ended with score : " in turn["content"]:
                                     score_found = True
                                     if "Task ended with score : 1" in turn["content"]:
                                         is_successful = True
                                     break

-                if score_found:
+                if is_successful:
                     break
             except:
                 continue

-        # Skip experiments with no score information
+        # If no score information was found, skip this task
         if not score_found:
-            skipped_experiments.append(exp_dir)
+            ignored_tasks.append(exp_dir)
             continue

         # Update the item-blocked data
         for item in cooking_items:
             item_blocked_data[item][blocked_key]["total"] += 1
             if is_successful:
                 item_blocked_data[item][blocked_key]["success"] += 1

-    return item_blocked_data, skipped_experiments
+    return item_blocked_data, ignored_tasks


 def main():
-    base_dir = "experiments"
-
-    # Get the model directories
-    all_model_dirs = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))]
-    gpt_dirs = [d for d in all_model_dirs if d.startswith("gpt-4o_30_cooking_tasks")]
-    claude_dirs = [d for d in all_model_dirs if d.startswith("claude-3-5-sonnet-latest_30_cooking_tasks")]
-
-    if not gpt_dirs or not claude_dirs:
-        print("Error: Could not find both model directories. Please check your paths.")
+    # Define lists for model directories and corresponding model names
+    model_dirs = [
+        "experiments/gpt-4o_2agent_NEW_cooking_tasks",
+        # "experiments/claude-3-5-sonnet_2agent_NEW_cooking_tasks",
+        # "experiments/claude-3-5-sonnet_3agent_NEW_cooking_tasks",
+        "experiments/gpt-4o_3agent_NEW_cooking_tasks",
+        # "experiments/1_claude-3-5-sonnet_4agents_NEW_cooking_tasks",
+        "experiments/gpt-4o_4agents_NEW_cooking_tasks",
+        "experiments/gpt-4o_5agents_NEW_cooking_tasks",
+        # "experiments/"
+    ]
+    model_names = [
+        "GPT-4o-2agent",
+        # "Claude-3.5-2agent",
+        "GPT-4o-3agent",
+        # "Claude-3.5-3agent",
+        # "Claude-3.5-4agent",
+        "GPT-4o-4agent",
+        "GPT-4o-5agent",
+        # "Another-Model"
+    ]
+
+    # Ensure both lists are of the same size
+    if len(model_dirs) != len(model_names):
+        print("Error: The number of model directories and model names must be the same.")
         return

-    # Use the first directory found for each model
-    gpt_dir = os.path.join(base_dir, gpt_dirs[0])
-    claude_dir = os.path.join(base_dir, claude_dirs[0])
-
-    print(f"Analyzing GPT-4o experiments in: {gpt_dir}")
-    print(f"Analyzing Claude-3.5-Sonnet experiments in: {claude_dir}")
-
     # Analyze each model directory
-    gpt_blocked_results, gpt_item_results, gpt_unique_items, gpt_skipped = analyze_experiments(gpt_dir, "GPT-4o")
-    claude_blocked_results, claude_item_results, claude_unique_items, claude_skipped = analyze_experiments(claude_dir, "Claude-3.5")
-
-    # Combine unique cooking items
-    all_cooking_items = gpt_unique_items.union(claude_unique_items)
-
-    # Generate item-blocked data for each model
-    gpt_item_blocked_data, gpt_skipped_detailed = generate_item_blocked_data(gpt_dir)
-    claude_item_blocked_data, claude_skipped_detailed = generate_item_blocked_data(claude_dir)
-
-    # Create model comparison data structures
-    models_blocked_results = {
-        "GPT-4o": gpt_blocked_results,
-        "Claude-3.5": claude_blocked_results
-    }
-
-    models_item_results = {
-        "GPT-4o": gpt_item_results,
-        "Claude-3.5": claude_item_results
-    }
-
-    models_data = {
-        "GPT-4o": (gpt_blocked_results, gpt_item_results, gpt_item_blocked_data),
-        "Claude-3.5": (claude_blocked_results, claude_item_results, claude_item_blocked_data)
-    }
+    models_blocked_results = {}
+    models_item_results = {}
+    all_cooking_items = set()
+    total_ignored_tasks = 0
+
+    for model_dir, model_name in zip(model_dirs, model_names):
+        print(f"Analyzing {model_name} experiments in: {model_dir}")
+        blocked_results, item_results, unique_items, ignored_tasks = analyze_experiments(model_dir, model_name)
+        models_blocked_results[model_name] = blocked_results
+        models_item_results[model_name] = item_results
+        all_cooking_items.update(unique_items)
+        total_ignored_tasks += len(ignored_tasks)
+
+        if ignored_tasks:
+            print(f" - {model_name}: Ignored {len(ignored_tasks)} tasks with no score information.")
+
+    # Print summary of ignored tasks
+    if total_ignored_tasks > 0:
+        print(f"\nTotal ignored tasks (missing score information): {total_ignored_tasks}")

     # Print the comparison tables
     print_model_comparison_blocked(models_blocked_results)
     print_model_comparison_items(models_item_results, all_cooking_items)
-    print_model_comparison_items_by_blocked(models_data, all_cooking_items)

     # Print overall statistics
     print("\nUnique Cooking Items Found:")
     print("=" * 60)
     print(", ".join(sorted(all_cooking_items)))
     print(f"Total unique items: {len(all_cooking_items)}")

-    # Print skipped experiment information
-    print("\nSkipped Experiments (No Score Information):")
-    print("=" * 60)
-    print(f"GPT-4o: {len(gpt_skipped)} experiments skipped")
-    print(f"Claude-3.5: {len(claude_skipped)} experiments skipped")
-
-    if gpt_skipped or claude_skipped:
-        print("\nSkipped experiment directories:")
-        if gpt_skipped:
-            print("GPT-4o:")
-            for exp in gpt_skipped:
-                print(f" - {exp}")
-        if claude_skipped:
-            print("Claude-3.5:")
-            for exp in claude_skipped:
-                print(f" - {exp}")


 if __name__ == "__main__":
     main()
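main() is now driven by two parallel lists, so adding or removing a model run is a one-line edit (the commented-out entries in the lists above show the intent), and results are merged across models before the comparison tables print. A stripped-down sketch of the aggregation loop, with analyze_experiments stubbed out so the snippet runs on its own:

    def analyze_experiments(model_dir, model_name):
        # Stub standing in for the real function defined earlier in this script.
        return {}, {}, set(), []

    model_dirs = [
        "experiments/gpt-4o_2agent_NEW_cooking_tasks",
        "experiments/gpt-4o_3agent_NEW_cooking_tasks",
    ]
    model_names = ["GPT-4o-2agent", "GPT-4o-3agent"]
    assert len(model_dirs) == len(model_names)

    models_blocked_results, models_item_results = {}, {}
    all_cooking_items, total_ignored_tasks = set(), 0

    for model_dir, model_name in zip(model_dirs, model_names):
        blocked, items, unique_items, ignored = analyze_experiments(model_dir, model_name)
        models_blocked_results[model_name] = blocked
        models_item_results[model_name] = items
        all_cooking_items.update(unique_items)
        total_ignored_tasks += len(ignored)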