import os
import json
import re
from collections import defaultdict

from prettytable import PrettyTable


def extract_success_scores(folders, model_names):
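    """Aggregate task success scores from per-task JSON logs and print summary tables.

    For each (folder, model name) pair, every sub-directory of the folder is
    treated as one task. The final "Task ended with score: <x>" system message
    found in that task's JSON logs supplies the task's score. The function then
    prints overall metrics, per-task scores, zero/null-score task lists, and
    averages grouped by material count, room count, and (material, room) pairs.
    """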
    assert len(folders) == len(model_names), "Folders and model names lists must have the same length."

    all_task_scores = defaultdict(dict)  # Stores task-wise scores per model
    zero_score_tasks = defaultdict(list)  # Stores tasks with 0 score per model
    null_score_tasks = defaultdict(list)  # Stores tasks with null score per model
    material_groups = defaultdict(lambda: defaultdict(list))
    room_groups = defaultdict(lambda: defaultdict(list))
    material_room_groups = defaultdict(lambda: defaultdict(list))
    overall_scores = defaultdict(list)  # Stores all scores for each model

    # Task folder names are expected to contain "materials_<m>_rooms_<r>"
    pattern = re.compile(r"materials_(\d+)_rooms_(\d+)")

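    # Scan each model's experiment folder: every sub-directory is treated as a
    # task, and the final "Task ended with score: <x>" system message in its
    # JSON logs supplies that task's score.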
    for root_dir, model_name in zip(folders, model_names):
        for task_folder in os.listdir(root_dir):
            task_path = os.path.join(root_dir, task_folder)
            if os.path.isdir(task_path):
                logs_found = False
                score_found = False

                for file_name in os.listdir(task_path):
                    if file_name.endswith(".json"):
                        logs_found = True
                        file_path = os.path.join(task_path, file_name)

                        try:
                            with open(file_path, 'r') as file:
                                data = json.load(file)

                            for turn in reversed(data.get("turns", [])):
                                if turn["role"] == "system" and "Task ended with score" in turn["content"]:
                                    score = float(turn["content"].split(":")[-1].strip())
                                    all_task_scores[task_folder][model_name] = score
                                    overall_scores[model_name].append(score)  # Add to overall scores
                                    score_found = True

                                    if score == 0:
                                        zero_score_tasks[model_name].append(task_folder)
                                    break

                            if score_found:
                                break
                        except Exception as e:
                            print(f"Error reading {file_path}: {e}")

                if logs_found and not score_found:
                    # Score not found but logs exist - mark as null
                    all_task_scores[task_folder][model_name] = None
                    null_score_tasks[model_name].append(task_folder)

                if not logs_found:
                    print(f"No log files found in {task_folder}")

    # Calculate model completion rates (ignore null scores)
    model_completion_rates = {}
    for model_name in model_names:
        valid_tasks = [task for task in all_task_scores.keys()
                       if model_name in all_task_scores[task] and all_task_scores[task][model_name] is not None]
        total_tasks = len(valid_tasks)
        completed_tasks = len([task for task in valid_tasks if all_task_scores[task][model_name] > 0])
        model_completion_rates[model_name] = (completed_tasks / total_tasks) if total_tasks > 0 else 0

    # Process task scores into groups (ignore null and 0 scores)
    for task, model_scores in all_task_scores.items():
        match = pattern.search(task)
        if match:
            material = int(match.group(1))
            room = int(match.group(2))

            for model, score in model_scores.items():
                if score is not None and score > 0:  # Ignore null and 0 scores
                    material_groups[material][model].append(score)
                    room_groups[room][model].append(score)
                    material_room_groups[(material, room)][model].append(score)

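    # Note: the grouped averages computed here include only tasks a model
    # completed (score > 0), so different models may be averaged over
    # different task subsets.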
    def calculate_average(group):
        return {key: {model: sum(scores) / len(scores) for model, scores in models.items() if scores}
                for key, models in group.items() if models}

    avg_material_scores = calculate_average(material_groups)
    avg_room_scores = calculate_average(room_groups)
    avg_material_room_scores = calculate_average(material_room_groups)

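    # Reporting helpers (closures over the dictionaries built above); each one
    # assembles a PrettyTable and prints it to stdout.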
    def display_table(title, data, tuple_keys=False):
        table = PrettyTable(["Category"] + model_names)
        for key, model_scores in sorted(data.items()):
            key_display = key if not tuple_keys else f"({key[0]}, {key[1]})"
            row = [key_display] + [round(model_scores.get(model, 0), 2) for model in model_names]
            table.add_row(row)
        print(f"\n{title}")
        print(table)

    def display_task_scores():
        table = PrettyTable(["Task"] + model_names)
        for task in sorted(all_task_scores.keys()):
            row = [task]
            for model in model_names:
                score = all_task_scores[task].get(model)
                if score is None:
                    row.append("null")
                else:
                    row.append(round(score, 2))
            table.add_row(row)
        print("\nTask-wise Success Scores")
        print(table)

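    # Zero-score tasks ran but scored 0; null-score tasks have JSON logs that
    # contain no "Task ended with score" message, so no final score was recorded.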
    def display_zero_and_null_score_tasks():
        for model in model_names:
            if zero_score_tasks[model]:
                table = PrettyTable([f"{model} - Tasks with 0 Score"])
                for task in zero_score_tasks[model]:
                    table.add_row([task])
                print(f"\n{model} - Tasks with 0 Success Score")
                print(table)

            if null_score_tasks[model]:
                table = PrettyTable([f"{model} - Tasks with Null Score"])
                for task in null_score_tasks[model]:
                    table.add_row([task])
                print(f"\n{model} - Tasks with Null Success Score")
                print(table)

    def display_overall_averages():
        table = PrettyTable(["Metric"] + model_names)

        # Overall average score (including zeros, excluding nulls)
        row_with_zeros = ["Average Score (All Tasks)"]
        for model in model_names:
            valid_scores = [s for s in overall_scores[model] if s is not None]
            avg = sum(valid_scores) / len(valid_scores) if valid_scores else 0
            row_with_zeros.append(round(avg, 2))
        table.add_row(row_with_zeros)

        # Overall average score (excluding zeros and nulls)
        row_without_zeros = ["Average Score (Completed Tasks)"]
        for model in model_names:
            completed_scores = [s for s in overall_scores[model] if s is not None and s > 0]
            avg = sum(completed_scores) / len(completed_scores) if completed_scores else 0
            row_without_zeros.append(round(avg, 2))
        table.add_row(row_without_zeros)

        # Task completion rate
        completion_row = ["Task Completion Rate (%)"]
        for model in model_names:
            completion_row.append(round(model_completion_rates[model] * 100, 2))
        table.add_row(completion_row)

        # Total number of tasks (excluding nulls)
        task_count_row = ["Total Tasks"]
        for model in model_names:
            valid_tasks = [task for task in all_task_scores.keys()
                           if model in all_task_scores[task] and all_task_scores[task][model] is not None]
            task_count_row.append(len(valid_tasks))
        table.add_row(task_count_row)

        print("\nOverall Performance Metrics")
        print(table)

    display_overall_averages()  # Display overall averages first
    display_task_scores()
    display_zero_and_null_score_tasks()
    display_table("Average Success Score by Material", avg_material_scores)
    display_table("Average Success Score by Room", avg_room_scores)
    display_table("Average Success Score by (Material, Room) Tuples", avg_material_room_scores, tuple_keys=True)


# Example usage
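# folders[i] must contain one sub-directory per task holding that run's JSON
# chat logs; model_names[i] is the display label used in the tables. The two
# lists are matched positionally.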
folders = ["experiments/gpt-4o_construction_tasks", "experiments/exp_03-23_12-31"]
model_names = ["GPT-4o", "Claude 3.5 Sonnet"]
extract_success_scores(folders, model_names)