Merge pull request #599 from uukelele-scratch/pollinations-support

TTS for OpenAI + Gemini, Update Gemini SDK
Max Robinson 2025-08-27 10:06:38 -05:00 committed by GitHub
commit d489cae49d
9 changed files with 260 additions and 115 deletions

View file

@@ -143,6 +143,12 @@ You can pass a string or an object for these fields. A model object must specify
     "api": "openai",
     "url": "https://api.openai.com/v1/",
     "model": "text-embedding-ada-002"
-  }
+  },
+  "speak_model": {
+    "api": "openai",
+    "url": "https://api.openai.com/v1/",
+    "model": "tts-1",
+    "voice": "echo"
+  }
 ```
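For profiles that do not need a custom endpoint, the same field also accepts the string shorthand used by the default profile further down, which speak.js parses as {provider}/{model}/{voice}:

```json
"speak_model": "openai/tts-1/echo"
```

Setting it to "system" instead falls back to the operating system's text-to-speech (see settings.js below).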

View file

@@ -2,8 +2,8 @@
   "type": "module",
   "dependencies": {
     "@anthropic-ai/sdk": "^0.17.1",
+    "@google/genai": "^1.15.0",
     "@cerebras/cerebras_cloud_sdk": "^1.46.0",
-    "@google/generative-ai": "^0.2.1",
     "@huggingface/inference": "^2.8.1",
     "@mistralai/mistralai": "^1.1.0",
     "canvas": "^3.1.0",
@@ -11,6 +11,8 @@
     "express": "^4.18.2",
     "google-translate-api-x": "^10.7.1",
     "groq-sdk": "^0.15.0",
+    "install": "^0.13.0",
+    "lamejs": "^1.2.1",
     "minecraft-data": "^3.78.0",
     "mineflayer": "^4.29.0",
     "mineflayer-armor-manager": "^2.0.1",
@@ -19,8 +21,8 @@
     "mineflayer-pathfinder": "^2.4.5",
     "mineflayer-pvp": "^1.3.2",
     "node-canvas-webgl": "PrismarineJS/node-canvas-webgl",
+    "npm": "^11.5.2",
     "openai": "^4.4.0",
-    "patch-package": "^8.0.0",
     "prismarine-item": "^1.15.0",
     "prismarine-viewer": "^1.32.0",
     "replicate": "^0.29.4",
@@ -39,6 +41,7 @@
     "@eslint/js": "^9.13.0",
     "eslint": "^9.13.0",
     "eslint-plugin-no-floating-promise": "^2.0.0",
-    "globals": "^15.11.0"
+    "globals": "^15.11.0",
+    "patch-package": "^8.0.0"
   }
 }

View file

@@ -0,0 +1,21 @@
diff --git a/node_modules/lamejs/lame.all.js b/node_modules/lamejs/lame.all.js
index bfd3637..b905508 100644
--- a/node_modules/lamejs/lame.all.js
+++ b/node_modules/lamejs/lame.all.js
@@ -1,4 +1,3 @@
-function lamejs() {
 function new_byte(count) {
     return new Int8Array(count);
 }
@@ -15511,8 +15510,9 @@ WavHeader.readHeader = function (dataView) {
 
 L3Side.SFBMAX = (Encoder.SBMAX_s * 3);
 //testFullLength();
+export var lamejs = {}
 lamejs.Mp3Encoder = Mp3Encoder;
 lamejs.WavHeader = WavHeader;
-}
+
 //fs=require('fs');
-lamejs();
+//lamejs();
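The patch unwraps lamejs's self-invoking wrapper into a real ES-module export, which is what lets gemini.js below import the encoder directly. A minimal consumption sketch; the patch-package postinstall wiring that re-applies patches/ after install is assumed, not shown in this diff:

```js
// After patch-package has applied the patch above, lamejs exposes a named
// ES-module export instead of only defining names inside an IIFE:
import { lamejs } from 'lamejs/lame.all.js';

// Same constructor call gemini.js uses: mono, 24 kHz input, 128 kbps output.
const encoder = new lamejs.Mp3Encoder(1, 24000, 128);
```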

View file

@@ -11,6 +11,8 @@
     "image_analysis": "You are a Minecraft bot named $NAME that has been given a screenshot of your current view. Analyze and summarize the view; describe terrain, blocks, entities, structures, and notable features. Focus on details relevant to the conversation. Note: the sky is always blue regardless of weather or time, dropped items are small pink cubes, and blocks below y=0 do not render. Be extremely concise and correct, respond only with your analysis, not conversationally. $STATS",
+    "speak_model": "openai/tts-1/echo",
     "modes": {
         "self_preservation": true,
         "unstuck": true,

View file

@@ -28,7 +28,11 @@ const settings = {
     "load_memory": false, // load memory from previous session
     "init_message": "Respond with hello world and your name", // sends to all on spawn
     "only_chat_with": [], // users that the bots listen to and send general messages to. if empty it will chat publicly
-    "speak": false, // allows all bots to speak through system text-to-speech. works on windows, mac, on linux you need to `apt install espeak`
+    "speak": true,
+    // enables text-to-speech for all bots. the voice is chosen by each profile's `speak_model`, in the format {provider}/{model}/{voice}, so every bot can have a different voice.
+    // if `speak_model` is set to "system", the bot uses system text-to-speech, which works on windows and mac; on linux you need to `apt install espeak`.
     "chat_ingame": true, // bot responses are shown in minecraft chat
     "language": "en", // translate to/from this language. Supports these language names: https://cloud.google.com/translate/docs/languages
     "render_bot_view": false, // show bot's view in browser at localhost:3000, 3001...

View file

@@ -383,9 +383,9 @@ export class Agent {
             }
         }
         else {
-            if (settings.speak) {
-                say(to_translate);
-            }
+            if (settings.speak) {
+                say(to_translate, this.prompter.profile.speak_model);
+            }
             if (settings.chat_ingame) {this.bot.chat(message);}
             sendOutputToServer(this.name, message);
         }

View file

@@ -1,43 +1,107 @@
-import { exec } from 'child_process';
+import { exec, spawn } from 'child_process';
+import { TTSConfig as gptTTSConfig } from '../models/gpt.js';
+import { TTSConfig as geminiTTSConfig } from '../models/gemini.js';
 
 let speakingQueue = [];
 let isSpeaking = false;
 
-export function say(textToSpeak) {
-    speakingQueue.push(textToSpeak);
-    if (!isSpeaking) {
-        processQueue();
-    }
+export function say(text, speak_model) {
+    speakingQueue.push([text, speak_model]);
+    if (!isSpeaking) processQueue();
 }
 
-function processQueue() {
+async function processQueue() {
     if (speakingQueue.length === 0) {
         isSpeaking = false;
         return;
     }
     isSpeaking = true;
-    const textToSpeak = speakingQueue.shift();
-    const isWin = process.platform === "win32";
-    const isMac = process.platform === "darwin";
-
-    if (isWin) {
-        command = `powershell -Command "Add-Type -AssemblyName System.Speech; $s = New-Object System.Speech.Synthesis.SpeechSynthesizer; $s.Rate = 2; $s.Speak(\\"${textToSpeak}\\"); $s.Dispose()"`;
-    } else if (isMac) {
-        command = `say "${textToSpeak}"`;
-    } else {
-        command = `espeak "${textToSpeak}"`;
-    }
-    exec(command, (error, stdout, stderr) => {
-        if (error) {
-            console.error(`Error: ${error.message}`);
-            console.error(`${error.stack}`);
-        } else if (stderr) {
-            console.error(`Error: ${stderr}`);
-        }
-        processQueue(); // Continue with the next message in the queue
-    });
+    const [txt, speak_model] = speakingQueue.shift();
+    let command;
+    const isWin = process.platform === 'win32';
+    const isMac = process.platform === 'darwin';
+    const model = speak_model || 'openai/tts-1/echo';
+
+    if (model === 'system') {
+        // system TTS
+        const cmd = isWin
+            ? `powershell -NoProfile -Command "Add-Type -AssemblyName System.Speech; \
+$s=New-Object System.Speech.Synthesis.SpeechSynthesizer; $s.Rate=2; \
+$s.Speak('${txt.replace(/'/g,"''")}'); $s.Dispose()"`
+            : isMac
+            ? `say "${txt.replace(/"/g,'\\"')}"`
+            : `espeak "${txt.replace(/"/g,'\\"')}"`;
+        exec(cmd, err => {
+            if (err) console.error('TTS error', err);
+            processQueue();
+        });
+    } else {
+        function getModelUrl(prov) {
+            if (prov === 'openai') {
+                return gptTTSConfig.baseUrl;
+            } else if (prov === 'google') {
+                return geminiTTSConfig.baseUrl;
+            } else {
+                // fallback
+                return 'https://api.openai.com/v1';
+            }
+        }
+
+        // remote audio provider
+        let prov, mdl, voice, url;
+        if (typeof model === "string") {
+            [prov, mdl, voice] = model.split('/');
+            url = getModelUrl(prov);
+        } else {
+            prov = model.api;
+            mdl = model.model;
+            voice = model.voice;
+            url = model.url || getModelUrl(prov);
+        }
+
+        try {
+            let audioData;
+            if (prov === "openai") {
+                audioData = await gptTTSConfig.sendAudioRequest(txt, mdl, voice, url);
+            } else if (prov === "google") {
+                audioData = await geminiTTSConfig.sendAudioRequest(txt, mdl, voice, url);
+            } else {
+                throw new Error(`TTS Provider ${prov} is not supported.`);
+            }
+            if (!audioData) {
+                throw new Error("TTS model did not return audio data");
+                // will be handled below
+            }
+
+            if (isWin) {
+                const ps = `
+                    Add-Type -AssemblyName presentationCore;
+                    $p=New-Object System.Windows.Media.MediaPlayer;
+                    $p.Open([Uri]::new("data:audio/mp3;base64,${audioData}"));
+                    $p.Play();
+                    Start-Sleep -Seconds ([math]::Ceiling($p.NaturalDuration.TimeSpan.TotalSeconds));
+                `;
+                spawn('powershell', ['-NoProfile','-Command', ps], {
+                    stdio: 'ignore', detached: true
+                }).unref();
+                processQueue();
+            } else {
+                const player = spawn('ffplay', ['-nodisp','-autoexit','pipe:0'], {
+                    stdio: ['pipe','ignore','ignore']
+                });
+                player.stdin.write(Buffer.from(audioData, 'base64'));
+                player.stdin.end();
+                player.on('exit', processQueue);
+            }
+        } catch (e) {
+            console.error('[TTS] Audio error', e);
+            processQueue();
+        }
+    }
 }
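Callers only ever enqueue; playback stays strictly sequential because processQueue() re-enters itself from each clip's completion callback. A minimal usage sketch, assuming the relevant API keys are configured and, on non-Windows hosts, that ffplay from FFmpeg is on PATH; the Gemini model and voice names are illustrative assumptions:

```js
import { say } from './speak.js'; // import path assumed

say("Hello world");                           // no model given: defaults to openai/tts-1/echo
say("Different voice", "openai/tts-1/nova");  // voice swapped per call
say("Gemini voice", { api: "google", model: "gemini-2.5-flash-preview-tts", voice: "Kore" }); // assumed names
say("Offline fallback", "system");            // OS-level TTS, no API key needed
```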

View file

@@ -1,13 +1,15 @@
-import { GoogleGenerativeAI } from '@google/generative-ai';
-import { toSinglePrompt, strictFormat } from '../utils/text.js';
+import { GoogleGenAI } from '@google/genai';
+import { strictFormat } from '../utils/text.js';
 import { getKey } from '../utils/keys.js';
+import { lamejs } from 'lamejs/lame.all.js';
 
 export class Gemini {
     static prefix = 'google';
     constructor(model_name, url, params) {
         this.model_name = model_name;
         this.params = params;
         this.url = url;
 
         this.safetySettings = [
             {
                 "category": "HARM_CATEGORY_DANGEROUS",
@@ -31,31 +33,12 @@ export class Gemini {
             },
         ];
 
-        this.genAI = new GoogleGenerativeAI(getKey('GEMINI_API_KEY'));
+        this.genAI = new GoogleGenAI({apiKey: getKey('GEMINI_API_KEY')});
     }
 
     async sendRequest(turns, systemMessage) {
-        let model;
-        const modelConfig = {
-            model: this.model_name || "gemini-2.5-flash",
-            // systemInstruction does not work bc google is trash
-        };
-        if (this.url) {
-            model = this.genAI.getGenerativeModel(
-                modelConfig,
-                { baseUrl: this.url },
-                { safetySettings: this.safetySettings }
-            );
-        } else {
-            model = this.genAI.getGenerativeModel(
-                modelConfig,
-                { safetySettings: this.safetySettings }
-            );
-        }
-
         console.log('Awaiting Google API response...');
-        turns.unshift({ role: 'system', content: systemMessage });
         turns = strictFormat(turns);
         let contents = [];
         for (let turn of turns) {
@@ -65,72 +48,58 @@ export class Gemini {
             });
         }
 
-        const result = await model.generateContent({
-            contents,
-            generationConfig: {
+        const result = await this.genAI.models.generateContent({
+            model: this.model_name || "gemini-2.5-flash",
+            contents: contents,
+            safetySettings: this.safetySettings,
+            config: {
+                systemInstruction: systemMessage,
                 ...(this.params || {})
             }
         });
-        const response = await result.response;
-        let text;
-
-        // Handle "thinking" models since they smart
-        if (this.model_name && this.model_name.includes("thinking")) {
-            if (
-                response.candidates &&
-                response.candidates.length > 0 &&
-                response.candidates[0].content &&
-                response.candidates[0].content.parts &&
-                response.candidates[0].content.parts.length > 1
-            ) {
-                text = response.candidates[0].content.parts[1].text;
-            } else {
-                console.warn("Unexpected response structure for thinking model:", response);
-                text = response.text();
-            }
-        } else {
-            text = response.text();
-        }
-
+        const response = await result.text;
         console.log('Received.');
-        return text;
+        return response;
     }
 
     async sendVisionRequest(turns, systemMessage, imageBuffer) {
-        let model;
-        if (this.url) {
-            model = this.genAI.getGenerativeModel(
-                { model: this.model_name || "gemini-1.5-flash" },
-                { baseUrl: this.url },
-                { safetySettings: this.safetySettings }
-            );
-        } else {
-            model = this.genAI.getGenerativeModel(
-                { model: this.model_name || "gemini-1.5-flash" },
-                { safetySettings: this.safetySettings }
-            );
-        }
-
         const imagePart = {
             inlineData: {
                 data: imageBuffer.toString('base64'),
                 mimeType: 'image/jpeg'
             }
         };
 
+        turns = strictFormat(turns);
+        let contents = [];
+        for (let turn of turns) {
+            contents.push({
+                role: turn.role === 'assistant' ? 'model' : 'user',
+                parts: [{ text: turn.content }]
+            });
+        }
+        contents.push({
+            role: 'user',
+            parts: [{ text: 'SYSTEM: Vision response' }, imagePart]
+        });
-        const stop_seq = '***';
-        const prompt = toSinglePrompt(turns, systemMessage, stop_seq, 'model');
         let res = null;
         try {
             console.log('Awaiting Google API vision response...');
-            const result = await model.generateContent([prompt, imagePart]);
-            const response = await result.response;
-            const text = response.text();
+            const result = await this.genAI.models.generateContent({
+                model: this.model_name || "gemini-2.5-flash",
+                contents: contents,
+                safetySettings: this.safetySettings,
+                config: {
+                    systemInstruction: systemMessage,
+                    ...(this.params || {})
+                }
+            });
+            res = await result.text;
             console.log('Received.');
-            if (!text.includes(stop_seq)) return text;
-            const idx = text.indexOf(stop_seq);
-            res = text.slice(0, idx);
         } catch (err) {
             console.log(err);
             if (err.message.includes("Image input modality is not enabled for models/")) {
@@ -143,19 +112,63 @@ export class Gemini {
     }
 
     async embed(text) {
-        let model = this.model_name || "text-embedding-004";
-        if (this.url) {
-            model = this.genAI.getGenerativeModel(
-                { model },
-                { baseUrl: this.url }
-            );
-        } else {
-            model = this.genAI.getGenerativeModel(
-                { model }
-            );
-        }
-
-        const result = await model.embedContent(text);
-        return result.embedding.values;
+        const result = await this.genAI.models.embedContent({
+            model: this.model_name || "gemini-embedding-001",
+            contents: text,
+        });
+        return result.embeddings;
     }
 }
+
+const sendAudioRequest = async (text, model, voice, url) => {
+    const ai = new GoogleGenAI({apiKey: getKey('GEMINI_API_KEY')});
+    const response = await ai.models.generateContent({
+        model: model,
+        contents: [{ parts: [{ text: text }] }],
+        config: {
+            responseModalities: ['AUDIO'],
+            speechConfig: {
+                voiceConfig: {
+                    prebuiltVoiceConfig: { voiceName: voice },
+                },
+            },
+        },
+    });
+    const data = response.candidates?.[0]?.content?.parts?.[0]?.inlineData?.data;
+
+    // data is base64-encoded pcm; convert pcm to mp3
+    const SAMPLE_RATE = 24000;
+    const CHANNELS = 1;
+    const pcmBuffer = Buffer.from(data, 'base64');
+    const pcmInt16Array = new Int16Array(
+        pcmBuffer.buffer,
+        pcmBuffer.byteOffset,
+        pcmBuffer.length / 2
+    );
+    const mp3encoder = new lamejs.Mp3Encoder(CHANNELS, SAMPLE_RATE, 128);
+    const sampleBlockSize = 1152; // Standard for MPEG audio
+    const mp3Data = [];
+    for (let i = 0; i < pcmInt16Array.length; i += sampleBlockSize) {
+        const sampleChunk = pcmInt16Array.subarray(i, i + sampleBlockSize);
+        const mp3buf = mp3encoder.encodeBuffer(sampleChunk);
+        if (mp3buf.length > 0) {
+            mp3Data.push(Buffer.from(mp3buf));
+        }
+    }
+    const mp3buf = mp3encoder.flush();
+    if (mp3buf.length > 0) {
+        mp3Data.push(Buffer.from(mp3buf));
+    }
+    const finalBuffer = Buffer.concat(mp3Data);
+    // finished converting
+    return finalBuffer.toString('base64');
+}
+
+export const TTSConfig = {
+    sendAudioRequest: sendAudioRequest,
+}
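The PCM-to-MP3 step hard-codes Gemini's output format: raw 16-bit PCM, mono, 24 kHz, which is why the Int16Array length is pcmBuffer.length / 2. A small sanity-check sketch of the same arithmetic:

```js
// One second of audio under sendAudioRequest's assumptions:
const SAMPLE_RATE = 24000;                 // 24 kHz mono
const BYTES_PER_SAMPLE = 2;                // 16-bit PCM
const bytesPerSecond = SAMPLE_RATE * BYTES_PER_SAMPLE;      // 48000 bytes
const samplesPerSecond = bytesPerSecond / BYTES_PER_SAMPLE; // 24000 samples
// lamejs consumes 1152-sample frames, so one second takes
// Math.ceil(24000 / 1152) = 21 calls to encodeBuffer(), plus one flush().
```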

View file

@@ -90,3 +90,35 @@ export class GPT {
         }
     }
 }
+
+const sendAudioRequest = async (text, model, voice, url) => {
+    const payload = {
+        model: model,
+        voice: voice,
+        input: text
+    };
+
+    let config = {};
+    if (url)
+        config.baseURL = url;
+    if (hasKey('OPENAI_ORG_ID'))
+        config.organization = getKey('OPENAI_ORG_ID');
+    config.apiKey = getKey('OPENAI_API_KEY');
+
+    const openai = new OpenAIApi(config);
+    const mp3 = await openai.audio.speech.create(payload);
+    const buffer = Buffer.from(await mp3.arrayBuffer());
+    const base64 = buffer.toString("base64");
+    return base64;
+}
+
+export const TTSConfig = {
+    sendAudioRequest: sendAudioRequest,
+    baseUrl: 'https://api.openai.com/v1',
+}
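speak.js normally drives this helper, but it can be exercised directly; a minimal sketch, assuming OPENAI_API_KEY is available to getKey and that the calling file is an ES module (for top-level await):

```js
import fs from 'fs';
import { TTSConfig } from './gpt.js'; // import path assumed

// sendAudioRequest resolves to base64-encoded MP3 bytes.
const b64 = await TTSConfig.sendAudioRequest('Hello from the bot', 'tts-1', 'echo', TTSConfig.baseUrl);
fs.writeFileSync('out.mp3', Buffer.from(b64, 'base64'));
```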