"""Run a task repeatedly with ``node main.js`` and record success metrics.

For every experiment the script launches the node process, then inspects each
agent's ``bots/<agent>/memory.json`` for the system turn that reports how the
task ended, and rewrites a results file with the running totals.
"""
import argparse
import json
import re
import subprocess
import time
from datetime import datetime

# Either a double-quoted string literal (captured so it can be kept verbatim)
# or a JavaScript line/block comment (dropped).  Matching strings first is what
# keeps "//" inside values such as "http://host" from being read as a comment.
_STRING_OR_COMMENT_RE = re.compile(
    r'("(?:\\.|[^"\\])*")|//[^\n]*|/\*.*?\*/', re.DOTALL
)
# Either a double-quoted string literal (kept) or a comma that is directly
# followed by a closing brace/bracket (dropped, JSON forbids trailing commas).
_STRING_OR_TRAILING_COMMA_RE = re.compile(
    r'("(?:\\.|[^"\\])*")|,(?=\s*[}\]])', re.DOTALL
)
# The agent records the outcome as "<message> ended with code : <code>".
_COMPLETION_CODE_RE = re.compile(r'code : (\d+)\b')

SUCCESS_CODE = 2
FAILURE_CODE = 4


def _keep_string_literal(match):
    """Return the matched string literal unchanged, or '' for anything else."""
    return match.group(1) or ''


def read_settings(file_path):
    """Read a ``settings.js`` file and return the agent names it lists.

    The file is expected to be ``export default { ... }`` where the object is
    JSON apart from comments and trailing commas.

    Args:
        file_path: Path to the settings file.

    Returns:
        One name per entry in ``profiles``; ``"./andy.json"`` yields ``"andy"``.

    Raises:
        OSError: The file cannot be read.
        json.JSONDecodeError: The cleaned-up content is still not valid JSON.
        KeyError: The settings object has no ``profiles`` entry.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Comments must go first: a trailing comma may be followed by a comment
    # before the closing bracket.
    content = _STRING_OR_COMMENT_RE.sub(_keep_string_literal, content)
    content = _STRING_OR_TRAILING_COMMA_RE.sub(_keep_string_literal, content)

    # Only the leading module-export prefix is removed, never text inside values.
    content = re.sub(r'^\s*export\s+default\b', '', content, count=1)
    content = content.strip().rstrip(';').rstrip()

    json_data = json.loads(content)
    profiles = json_data['profiles']

    # Profiles are strings like "./andy.json"; keep the file name up to the
    # first dot.
    return [profile.split('/')[-1].split('.')[0] for profile in profiles]


def _completion_code(content):
    """Return the last ``code : N`` value found in *content*, or None."""
    if not isinstance(content, str):
        return None
    codes = _COMPLETION_CODE_RE.findall(content)
    # The code is appended after the free-form message, so the last match wins.
    return int(codes[-1]) if codes else None


def check_task_completion(agents):
    """Decide from the agents' ``memory.json`` files whether the task succeeded.

    Agents are checked in order.  For each one, the system turns are scanned
    from newest to oldest and the first conclusive completion code decides the
    result for the whole experiment.

    Args:
        agents: Iterable of agent names; each is read from
            ``bots/<agent>/memory.json`` relative to the working directory.

    Returns:
        True when a turn reports code 2, False when it reports code 4 or when
        no agent has a conclusive result.
    """
    # NOTE(review): a memory.json left over from an earlier run would be read
    # again here; confirm that each run rewrites it.
    for agent in agents:
        memory_path = f"bots/{agent}/memory.json"
        try:
            with open(memory_path, 'r', encoding='utf-8') as f:
                memory = json.load(f)

            for turn in reversed(memory['turns']):
                if turn.get('role') != 'system':
                    continue
                code = _completion_code(turn.get('content'))
                if code == SUCCESS_CODE:
                    return True
                if code == FAILURE_CODE:
                    return False

        except (OSError, json.JSONDecodeError, KeyError, TypeError,
                AttributeError) as e:
            # A missing or malformed memory file only disqualifies this agent.
            print(f"Error reading memory for agent {agent}: {e}")
            continue

    return False  # Default to failure if no conclusive result found


def update_results_file(task_id, success_count, total_count, time_taken,
                        experiment_results, results_filename):
    """Rewrite the results file with the current totals.

    Args:
        task_id: Identifier written in the file header.
        success_count: Number of successful experiments so far.
        total_count: Number of experiments run so far.
        time_taken: Duration in seconds of the most recent experiment.
        experiment_results: List of dicts with ``success`` and ``time_taken``.
        results_filename: File to (over)write.
    """
    # Guard the divisions so an empty run produces a report instead of a crash.
    success_ratio = success_count / total_count if total_count else 0.0
    total_time = sum(result['time_taken'] for result in experiment_results)
    average_time = total_time / total_count if total_count else 0.0

    # 'w' mode: the file always reflects the latest state only.
    with open(results_filename, 'w', encoding='utf-8') as f:
        f.write(f"Task ID: {task_id}\n")
        f.write(f"Experiments completed: {total_count}\n")
        f.write(f"Successful experiments: {success_count}\n")
        f.write(f"Success ratio: {success_ratio:.2f}\n")
        f.write(f"Time taken for last experiment: {time_taken:.2f} seconds\n")

        # Individual experiment results
        for i, result in enumerate(experiment_results, 1):
            outcome = 'Success' if result['success'] else 'Failure'
            f.write(
                f"Experiment {i}: {outcome}, "
                f"Time taken: {result['time_taken']:.2f} seconds\n"
            )

        # Aggregated metrics
        f.write("\nAggregated metrics:\n")
        f.write(f"Total experiments: {total_count}\n")
        f.write(f"Total successful experiments: {success_count}\n")
        f.write(f"Overall success ratio: {success_ratio:.2f}\n")
        f.write(f"Total time taken: {total_time:.2f} seconds\n")
        f.write(f"Average time per experiment: {average_time:.2f} seconds\n")
        f.write(
            f"Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
        )


def run_experiment(task_path, task_id, num_exp):
    """Run ``node main.js`` *num_exp* times and track the outcomes.

    Args:
        task_path: Value passed to ``--task_path``.
        task_id: Value passed to ``--task_id``; also used in the results
            file name.
        num_exp: Number of experiments to run.
    """
    agents = read_settings(file_path="settings.js")
    print(f"Detected agents: {agents}")

    # One timestamp for the whole session so every update hits the same file.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    results_filename = f"results_{task_id}_{timestamp}.txt"
    print(f"Results will be saved to: {results_filename}")

    # An argument list (no shell) keeps spaces or shell metacharacters in the
    # task path/id from splitting or injecting into the command.
    cmd = ["node", "main.js",
           "--task_path", str(task_path),
           "--task_id", str(task_id)]

    success_count = 0
    experiment_results = []

    for exp_num in range(num_exp):
        print(f"\nRunning experiment {exp_num + 1}/{num_exp}")

        start_time = time.time()

        try:
            subprocess.run(cmd, check=True)
        except (subprocess.CalledProcessError, OSError) as e:
            # A crashed or unstartable run is recorded as a failure so the
            # per-experiment list, the totals and the averages stay in step.
            print(f"Error running experiment: {e}")
            success = False
        else:
            success = check_task_completion(agents)

        if success:
            success_count += 1
            print(f"Experiment {exp_num + 1} successful")
        else:
            print(f"Experiment {exp_num + 1} failed")

        time_taken = time.time() - start_time

        experiment_results.append({
            'success': success,
            'time_taken': time_taken
        })

        update_results_file(task_id, success_count, exp_num + 1, time_taken,
                            experiment_results, results_filename)

        # Small delay between experiments
        time.sleep(1)

    final_ratio = success_count / num_exp if num_exp > 0 else 0.0
    print(f"\nExperiments completed. Final success ratio: {final_ratio:.2f}")


def main():
    """Parse the command line and run the requested experiments."""
    parser = argparse.ArgumentParser(
        description='Run Minecraft AI agent experiments')
    parser.add_argument('task_path', help='Path to the task file')
    parser.add_argument('task_id', help='ID of the task to run')
    parser.add_argument('num_exp', type=int,
                        help='Number of experiments to run')

    args = parser.parse_args()

    run_experiment(args.task_path, args.task_id, args.num_exp)


if __name__ == "__main__":
    main()
async appendFullHistory(to_store) { if (this.full_history_fp === undefined) { const string_timestamp = new Date().toLocaleString().replace(/[/:]/g, '-').replace(/ /g, '').replace(/,/g, '_'); this.full_history_fp = `./bots/${this.name}/histories/${string_timestamp}.json`; @@ -75,7 +75,7 @@ export class History { chunk.push(this.turns.shift()); // remove until turns starts with system/user message await this.summarizeMemories(chunk); - this.appendFullHistory(chunk); + await this.appendFullHistory(chunk); } }