diff --git a/README.md b/README.md
index abd5e98..6b909f8 100644
--- a/README.md
+++ b/README.md
@@ -143,6 +143,12 @@ You can pass a string or an object for these fields. A model object must specify
     "api": "openai",
     "url": "https://api.openai.com/v1/",
     "model": "text-embedding-ada-002"
+},
+"speak_model": {
+    "api": "openai",
+    "url": "https://api.openai.com/v1/",
+    "model": "tts-1",
+    "voice": "echo"
 }
 ```
diff --git a/package.json b/package.json
index 106a75c..4299df0 100644
--- a/package.json
+++ b/package.json
@@ -2,8 +2,8 @@
   "type": "module",
   "dependencies": {
     "@anthropic-ai/sdk": "^0.17.1",
+    "@google/genai": "^1.15.0",
     "@cerebras/cerebras_cloud_sdk": "^1.46.0",
-    "@google/generative-ai": "^0.2.1",
     "@huggingface/inference": "^2.8.1",
     "@mistralai/mistralai": "^1.1.0",
     "canvas": "^3.1.0",
@@ -11,6 +11,8 @@
     "express": "^4.18.2",
     "google-translate-api-x": "^10.7.1",
     "groq-sdk": "^0.15.0",
+    "install": "^0.13.0",
+    "lamejs": "^1.2.1",
     "minecraft-data": "^3.78.0",
     "mineflayer": "^4.29.0",
     "mineflayer-armor-manager": "^2.0.1",
@@ -19,8 +21,8 @@
     "mineflayer-pathfinder": "^2.4.5",
     "mineflayer-pvp": "^1.3.2",
     "node-canvas-webgl": "PrismarineJS/node-canvas-webgl",
+    "npm": "^11.5.2",
     "openai": "^4.4.0",
-    "patch-package": "^8.0.0",
     "prismarine-item": "^1.15.0",
     "prismarine-viewer": "^1.32.0",
     "replicate": "^0.29.4",
@@ -39,6 +41,7 @@
     "@eslint/js": "^9.13.0",
     "eslint": "^9.13.0",
     "eslint-plugin-no-floating-promise": "^2.0.0",
-    "globals": "^15.11.0"
+    "globals": "^15.11.0",
+    "patch-package": "^8.0.0"
   }
 }
diff --git a/patches/lamejs+1.2.1.patch b/patches/lamejs+1.2.1.patch
new file mode 100644
index 0000000..8950598
--- /dev/null
+++ b/patches/lamejs+1.2.1.patch
@@ -0,0 +1,21 @@
+diff --git a/node_modules/lamejs/lame.all.js b/node_modules/lamejs/lame.all.js
+index bfd3637..b905508 100644
+--- a/node_modules/lamejs/lame.all.js
++++ b/node_modules/lamejs/lame.all.js
+@@ -1,4 +1,3 @@
+-function lamejs() {
+ function new_byte(count) {
+     return new Int8Array(count);
+ }
+@@ -15511,8 +15510,9 @@ WavHeader.readHeader = function (dataView) {
+ 
+ L3Side.SFBMAX = (Encoder.SBMAX_s * 3);
+ //testFullLength();
++export var lamejs = {}
+ lamejs.Mp3Encoder = Mp3Encoder;
+ lamejs.WavHeader = WavHeader;
+-}
++
+ //fs=require('fs');
+-lamejs();
++//lamejs();
diff --git a/profiles/defaults/_default.json b/profiles/defaults/_default.json
index bf31d22..3d514ac 100644
--- a/profiles/defaults/_default.json
+++ b/profiles/defaults/_default.json
@@ -11,6 +11,8 @@
     "image_analysis": "You are a Minecraft bot named $NAME that has been given a screenshot of your current view. Analyze and summarize the view; describe terrain, blocks, entities, structures, and notable features. Focus on details relevant to the conversation. Note: the sky is always blue regardless of weather or time, dropped items are small pink cubes, and blocks below y=0 do not render. Be extremely concise and correct, respond only with your analysis, not conversationally. $STATS",
 
+    "speak_model": "openai/tts-1/echo",
+
     "modes": {
         "self_preservation": true,
         "unstuck": true,
diff --git a/settings.js b/settings.js
index 13c6cf7..fe37d4d 100644
--- a/settings.js
+++ b/settings.js
@@ -28,7 +28,11 @@ const settings = {
     "load_memory": false, // load memory from previous session
     "init_message": "Respond with hello world and your name", // sends to all on spawn
     "only_chat_with": [], // users that the bots listen to and send general messages to. if empty it will chat publicly
-    "speak": false, // allows all bots to speak through system text-to-speech. works on windows, mac, on linux you need to `apt install espeak`
+
+    "speak": true,
+    // enables text-to-speech for all bots. the speech model and voice are chosen per profile via "speak_model", in the format {provider}/{model}/{voice}, so each bot can have its own voice.
+    // if "speak_model" is set to "system", the operating system's text-to-speech is used instead: this works out of the box on windows and mac, but on linux you need to `apt install espeak`.
+
     "chat_ingame": true, // bot responses are shown in minecraft chat
     "language": "en", // translate to/from this language. Supports these language names: https://cloud.google.com/translate/docs/languages
     "render_bot_view": false, // show bot's view in browser at localhost:3000, 3001...
diff --git a/src/agent/agent.js b/src/agent/agent.js
index 4c4072b..d69433d 100644
--- a/src/agent/agent.js
+++ b/src/agent/agent.js
@@ -383,9 +383,9 @@ export class Agent {
             }
         }
         else {
-            if (settings.speak) {
-                say(to_translate);
-            }
+            if (settings.speak) {
+                say(to_translate, this.prompter.profile.speak_model);
+            }
             if (settings.chat_ingame) {this.bot.chat(message);}
             sendOutputToServer(this.name, message);
         }
     }
diff --git a/src/agent/speak.js b/src/agent/speak.js
index e5fe658..a68735b 100644
--- a/src/agent/speak.js
+++ b/src/agent/speak.js
@@ -1,43 +1,107 @@
-import { exec } from 'child_process';
+import { exec, spawn } from 'child_process';
+import { TTSConfig as gptTTSConfig } from '../models/gpt.js';
+import { TTSConfig as geminiTTSConfig } from '../models/gemini.js';
 
 let speakingQueue = [];
 let isSpeaking = false;
 
-export function say(textToSpeak) {
-    speakingQueue.push(textToSpeak);
-    if (!isSpeaking) {
-        processQueue();
-    }
+export function say(text, speak_model) {
+    speakingQueue.push([text, speak_model]);
+    if (!isSpeaking) processQueue();
 }
 
-function processQueue() {
+async function processQueue() {
     if (speakingQueue.length === 0) {
         isSpeaking = false;
         return;
     }
-
     isSpeaking = true;
-    const textToSpeak = speakingQueue.shift();
-    const isWin = process.platform === "win32";
-    const isMac = process.platform === "darwin";
+    const [txt, speak_model] = speakingQueue.shift();
 
-    let command;
+    const isWin = process.platform === 'win32';
+    const isMac = process.platform === 'darwin';
+    const model = speak_model || 'openai/tts-1/echo';
+
+    if (model === 'system') {
+        // system TTS
+        const cmd = isWin
+            ? `powershell -NoProfile -Command "Add-Type -AssemblyName System.Speech; \
+$s=New-Object System.Speech.Synthesis.SpeechSynthesizer; $s.Rate=2; \
+$s.Speak('${txt.replace(/'/g,"''")}'); $s.Dispose()"`
+            : isMac
+                ? `say "${txt.replace(/"/g,'\\"')}"`
+                : `espeak "${txt.replace(/"/g,'\\"')}"`;
+
+        exec(cmd, err => {
+            if (err) console.error('TTS error', err);
+            processQueue();
+        });
 
-    if (isWin) {
-        command = `powershell -Command "Add-Type -AssemblyName System.Speech; $s = New-Object System.Speech.Synthesis.SpeechSynthesizer; $s.Rate = 2; $s.Speak(\\"${textToSpeak}\\"); $s.Dispose()"`;
-    } else if (isMac) {
-        command = `say "${textToSpeak}"`;
-    } else {
+    } else {
-        command = `espeak "${textToSpeak}"`;
-    }
-    exec(command, (error, stdout, stderr) => {
-        if (error) {
-            console.error(`Error: ${error.message}`);
-            console.error(`${error.stack}`);
-        } else if (stderr) {
-            console.error(`Error: ${stderr}`);
+        function getModelUrl(prov) {
+            if (prov === 'openai') {
+                return gptTTSConfig.baseUrl;
+            } else if (prov === 'google') {
+                return geminiTTSConfig.baseUrl;
+            } else {
+                // fallback
+                return 'https://api.openai.com/v1';
+            }
         }
-        processQueue(); // Continue with the next message in the queue
-    });
+
+        // remote audio provider
+        let prov, mdl, voice, url;
+        if (typeof model === "string") {
+            [prov, mdl, voice] = model.split('/');
+            url = getModelUrl(prov);
+        } else {
+            prov = model.api;
+            mdl = model.model;
+            voice = model.voice;
+            url = model.url || getModelUrl(prov);
+        }
+
+        try {
+            let audioData;
+            if (prov === "openai") {
+                audioData = await gptTTSConfig.sendAudioRequest(txt, mdl, voice, url);
+            } else if (prov === "google") {
+                audioData = await geminiTTSConfig.sendAudioRequest(txt, mdl, voice, url);
+            } else {
+                throw new Error(`TTS Provider ${prov} is not supported.`);
+            }
+
+            if (!audioData) {
+                throw new Error("TTS model did not return audio data");
+                // handled by the catch block below
+            }
+
+            if (isWin) {
+                const ps = `
+                    Add-Type -AssemblyName presentationCore;
+                    $p=New-Object System.Windows.Media.MediaPlayer;
+                    $p.Open([Uri]::new("data:audio/mp3;base64,${audioData}"));
+                    $p.Play();
+                    Start-Sleep -Seconds ([math]::Ceiling($p.NaturalDuration.TimeSpan.TotalSeconds));
+                `;
+                spawn('powershell', ['-NoProfile','-Command', ps], {
+                    stdio: 'ignore', detached: true
+                }).unref();
+                processQueue();
+
+            } else {
+                const player = spawn('ffplay', ['-nodisp','-autoexit','pipe:0'], { // requires ffplay (ships with ffmpeg) on PATH
+                    stdio: ['pipe','ignore','ignore']
+                });
+                player.stdin.write(Buffer.from(audioData, 'base64'));
+                player.stdin.end();
+                player.on('exit', processQueue);
+            }
+
+        } catch (e) {
+            console.error('[TTS] Audio error', e);
+            processQueue();
+        }
+    }
 }
diff --git a/src/models/gemini.js b/src/models/gemini.js
index ba24072..2d7bd33 100644
--- a/src/models/gemini.js
+++ b/src/models/gemini.js
@@ -1,13 +1,15 @@
-import { GoogleGenerativeAI } from '@google/generative-ai';
-import { toSinglePrompt, strictFormat } from '../utils/text.js';
+import { GoogleGenAI } from '@google/genai';
+import { strictFormat } from '../utils/text.js';
 import { getKey } from '../utils/keys.js';
+import { lamejs } from 'lamejs/lame.all.js';
+
+
 
 export class Gemini {
     static prefix = 'google';
     constructor(model_name, url, params) {
         this.model_name = model_name;
         this.params = params;
-        this.url = url;
         this.safetySettings = [
             {
                 "category": "HARM_CATEGORY_DANGEROUS",
@@ -31,31 +33,12 @@
             },
         ];
-        this.genAI = new GoogleGenerativeAI(getKey('GEMINI_API_KEY'));
+        this.genAI = new GoogleGenAI({apiKey: getKey('GEMINI_API_KEY')});
     }
 
     async sendRequest(turns, systemMessage) {
-        let model;
-        const modelConfig = {
-            model: this.model_name || "gemini-2.5-flash",
-            // systemInstruction does not work bc google is trash
-        };
-        if (this.url) {
-            model = this.genAI.getGenerativeModel(
-                modelConfig,
-                { baseUrl: this.url },
-                { safetySettings: this.safetySettings }
-            );
-        } else {
-            model = this.genAI.getGenerativeModel(
-                modelConfig,
-                { safetySettings: this.safetySettings }
-            );
-        }
-
         console.log('Awaiting Google API response...');
-        turns.unshift({ role: 'system', content: systemMessage });
         turns = strictFormat(turns);
         let contents = [];
         for (let turn of turns) {
@@ -65,72 +48,58 @@
             });
         }
 
-        const result = await model.generateContent({
-            contents,
-            generationConfig: {
+        const result = await this.genAI.models.generateContent({
+            model: this.model_name || "gemini-2.5-flash",
+            contents: contents,
+            config: {
+                systemInstruction: systemMessage,
+                safetySettings: this.safetySettings,
                 ...(this.params || {})
             }
         });
-        const response = await result.response;
-        let text;
-
-        // Handle "thinking" models since they smart
-        if (this.model_name && this.model_name.includes("thinking")) {
-            if (
-                response.candidates &&
-                response.candidates.length > 0 &&
-                response.candidates[0].content &&
-                response.candidates[0].content.parts &&
-                response.candidates[0].content.parts.length > 1
-            ) {
-                text = response.candidates[0].content.parts[1].text;
-            } else {
-                console.warn("Unexpected response structure for thinking model:", response);
-                text = response.text();
-            }
-        } else {
-            text = response.text();
-        }
+        const response = result.text;
         console.log('Received.');
-        return text;
+        return response;
     }
 
     async sendVisionRequest(turns, systemMessage, imageBuffer) {
-        let model;
-        if (this.url) {
-            model = this.genAI.getGenerativeModel(
-                { model: this.model_name || "gemini-1.5-flash" },
-                { baseUrl: this.url },
-                { safetySettings: this.safetySettings }
-            );
-        } else {
-            model = this.genAI.getGenerativeModel(
-                { model: this.model_name || "gemini-1.5-flash" },
-                { safetySettings: this.safetySettings }
-            );
-        }
-
         const imagePart = { inlineData: { data: imageBuffer.toString('base64'), mimeType: 'image/jpeg' } };
+
+        turns = strictFormat(turns);
+        let contents = [];
+        for (let turn of turns) {
+            contents.push({
+                role: turn.role === 'assistant' ? 'model' : 'user',
+                parts: [{ text: turn.content }]
+            });
+        }
+        contents.push({
+            role: 'user',
+            parts: [{ text: 'SYSTEM: Vision response' }, imagePart]
+        });
 
-        const stop_seq = '***';
-        const prompt = toSinglePrompt(turns, systemMessage, stop_seq, 'model');
         let res = null;
         try {
             console.log('Awaiting Google API vision response...');
-            const result = await model.generateContent([prompt, imagePart]);
-            const response = await result.response;
-            const text = response.text();
+            const result = await this.genAI.models.generateContent({
+                model: this.model_name || "gemini-1.5-flash",
+                contents: contents,
+                config: {
+                    systemInstruction: systemMessage,
+                    safetySettings: this.safetySettings,
+                    ...(this.params || {})
+                }
+            });
+            res = result.text;
             console.log('Received.');
-            if (!text.includes(stop_seq)) return text;
-            const idx = text.indexOf(stop_seq);
-            res = text.slice(0, idx);
         } catch (err) {
             console.log(err);
             if (err.message.includes("Image input modality is not enabled for models/")) {
@@ -143,19 +112,63 @@
     }
 
     async embed(text) {
-        let model = this.model_name || "text-embedding-004";
-        if (this.url) {
-            model = this.genAI.getGenerativeModel(
-                { model },
-                { baseUrl: this.url }
-            );
-        } else {
-            model = this.genAI.getGenerativeModel(
-                { model }
-            );
-        }
+        const result = await this.genAI.models.embedContent({
+            model: this.model_name || "gemini-embedding-001",
+            contents: text,
+        });
 
-        const result = await model.embedContent(text);
-        return result.embedding.values;
+        return result.embeddings[0].values; // return the raw vector, matching the old embedding.values shape
     }
 }
+
+const sendAudioRequest = async (text, model, voice, url) => {
+    const ai = new GoogleGenAI({apiKey: getKey('GEMINI_API_KEY')});
+
+    const response = await ai.models.generateContent({
+        model: model,
+        contents: [{ parts: [{ text: text }] }],
+        config: {
+            responseModalities: ['AUDIO'],
+            speechConfig: {
+                voiceConfig: {
+                    prebuiltVoiceConfig: { voiceName: voice },
+                },
+            },
+        },
+    });
+
+    const data = response.candidates?.[0]?.content?.parts?.[0]?.inlineData?.data;
+    // data is base64-encoded raw pcm (16-bit mono at 24 kHz, per the constants below)
+
+    // convert the pcm to mp3 with lamejs so it can be played back / embedded as a data URI
+    const SAMPLE_RATE = 24000;
+    const CHANNELS = 1;
+    const pcmBuffer = Buffer.from(data, 'base64');
+    const pcmInt16Array = new Int16Array(
+        pcmBuffer.buffer,
+        pcmBuffer.byteOffset,
+        pcmBuffer.length / 2
+    );
+    const mp3encoder = new lamejs.Mp3Encoder(CHANNELS, SAMPLE_RATE, 128);
+    const sampleBlockSize = 1152; // standard MPEG audio frame size
+    const mp3Data = [];
+    for (let i = 0; i < pcmInt16Array.length; i += sampleBlockSize) {
+        const sampleChunk = pcmInt16Array.subarray(i, i + sampleBlockSize);
+        const mp3buf = mp3encoder.encodeBuffer(sampleChunk);
+        if (mp3buf.length > 0) {
+            mp3Data.push(Buffer.from(mp3buf));
+        }
+    }
+    const mp3buf = mp3encoder.flush();
+    if (mp3buf.length > 0) {
+        mp3Data.push(Buffer.from(mp3buf));
+    }
+    const finalBuffer = Buffer.concat(mp3Data);
+    // finished converting
+
+    return finalBuffer.toString('base64');
+}
+
+export const TTSConfig = {
+    sendAudioRequest: sendAudioRequest,
+}
\ No newline at end of file
diff --git a/src/models/gpt.js b/src/models/gpt.js
index ea7d600..142a766 100644
--- a/src/models/gpt.js
+++ b/src/models/gpt.js
@@ -90,3 +90,35 @@
     }
 
 }
+
+const sendAudioRequest = async (text, model, voice, url) => {
+    const payload = {
+        model: model,
+        voice: voice,
+        input: text
+    }
+
+    let audioData = null;
+
+    let config = {};
+
+    if (url)
+        config.baseURL = url;
+
+    if (hasKey('OPENAI_ORG_ID'))
+        config.organization = getKey('OPENAI_ORG_ID');
+
+    config.apiKey = getKey('OPENAI_API_KEY');
+
+    const openai = new OpenAIApi(config);
+
+    const mp3 = await openai.audio.speech.create(payload);
+    const buffer = Buffer.from(await mp3.arrayBuffer());
+    const base64 = buffer.toString("base64");
+    return base64;
+}
+
+export const TTSConfig = {
+    sendAudioRequest: sendAudioRequest,
+    baseUrl: 'https://api.openai.com/v1',
+}
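
A minimal sketch of how a profile selects a voice once this patch is applied. The string form is the value added to profiles/defaults/_default.json above (and "system" selects OS text-to-speech instead); the object form follows the README hunk. The Google model name "gemini-2.5-flash-preview-tts" and the voice "Kore" are illustrative assumptions, not values taken from this patch:

    "speak_model": "openai/tts-1/echo"

    "speak_model": {
        "api": "google",
        "model": "gemini-2.5-flash-preview-tts",
        "voice": "Kore"
    }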