Merge pull request #599 from uukelele-scratch/pollinations-support

TTS for OpenAI + Gemini, Update Gemini SDK
2025-08-28 18:03:03 +02:00 · 2025-08-27 10:06:38 -05:00 · 2025-08-27 10:06:38 -05:00 · d489cae49d
commit d489cae49d
parent de03a8a25d e67bd9ab92
9 changed files with 260 additions and 115 deletions
--- a/README.md
+++ b/README.md
@ -143,6 +143,12 @@ You can pass a string or an object for these fields. A model object must specify
  "api": "openai",
  "url": "https://api.openai.com/v1/",
  "model": "text-embedding-ada-002"
 },
 "speak_model": {
  "api": "openai",
  "url": "https://api.openai.com/v1/",
  "model": "tts-1",
  "voice": "echo"
 }
 ```
--- a/package.json
+++ b/package.json
@ -2,8 +2,8 @@
    "type": "module",
    "dependencies": {
        "@anthropic-ai/sdk": "^0.17.1",
        "@google/genai": "^1.15.0",
        "@cerebras/cerebras_cloud_sdk": "^1.46.0",
        "@google/generative-ai": "^0.2.1",
        "@huggingface/inference": "^2.8.1",
        "@mistralai/mistralai": "^1.1.0",
        "canvas": "^3.1.0",
@ -11,6 +11,8 @@
        "express": "^4.18.2",
        "google-translate-api-x": "^10.7.1",
        "groq-sdk": "^0.15.0",
        "install": "^0.13.0",
        "lamejs": "^1.2.1",
        "minecraft-data": "^3.78.0",
        "mineflayer": "^4.29.0",
        "mineflayer-armor-manager": "^2.0.1",
@ -19,8 +21,8 @@
        "mineflayer-pathfinder": "^2.4.5",
        "mineflayer-pvp": "^1.3.2",
        "node-canvas-webgl": "PrismarineJS/node-canvas-webgl",
        "npm": "^11.5.2",
        "openai": "^4.4.0",
        "patch-package": "^8.0.0",
        "prismarine-item": "^1.15.0",
        "prismarine-viewer": "^1.32.0",
        "replicate": "^0.29.4",
@ -39,6 +41,7 @@
        "@eslint/js": "^9.13.0",
        "eslint": "^9.13.0",
        "eslint-plugin-no-floating-promise": "^2.0.0",
-        "globals": "^15.11.0"
+        "globals": "^15.11.0",
        "patch-package": "^8.0.0"
    }
 }
--- a/patches/lamejs+1.2.1.patch
+++ b/patches/lamejs+1.2.1.patch
@ -0,0 +1,21 @@
 diff --git a/node_modules/lamejs/lame.all.js b/node_modules/lamejs/lame.all.js
 index bfd3637..b905508 100644
 --- a/node_modules/lamejs/lame.all.js
 +++ b/node_modules/lamejs/lame.all.js
@@ -1,4 +1,3 @@
 -function lamejs() {
 function new_byte(count) {
     return new Int8Array(count);
 }
@@ -15511,8 +15510,9 @@ WavHeader.readHeader = function (dataView) {
 L3Side.SFBMAX = (Encoder.SBMAX_s * 3);
 //testFullLength();
 +export var lamejs = {}
 lamejs.Mp3Encoder = Mp3Encoder;
 lamejs.WavHeader = WavHeader;
 -}
 +
 //fs=require('fs');
 -lamejs();
 +//lamejs();
--- a/profiles/defaults/_default.json
+++ b/profiles/defaults/_default.json
@ -11,6 +11,8 @@
    "image_analysis": "You are a Minecraft bot named $NAME that has been given a screenshot of your current view. Analyze and summarize the view; describe terrain, blocks, entities, structures, and notable features. Focus on details relevant to the conversation. Note: the sky is always blue regardless of weather or time, dropped items are small pink cubes, and blocks below y=0 do not render. Be extremely concise and correct, respond only with your analysis, not conversationally. $STATS",
    "speak_model": "openai/tts-1/echo",
    "modes": {
        "self_preservation": true,
        "unstuck": true,
--- a/settings.js
+++ b/settings.js
@ -28,7 +28,11 @@ const settings = {
    "load_memory": false, // load memory from previous session
    "init_message": "Respond with hello world and your name", // sends to all on spawn
    "only_chat_with": [], // users that the bots listen to and send general messages to. if empty it will chat publicly
-    "speak": false, // allows all bots to speak through system text-to-speech. works on windows, mac, on linux you need to `apt install espeak`
+
    "speak": true,
    // Allows all bots to speak through text-to-speech. The speech model is specified inside each profile as "speak_model", in the format {provider}/{model}/{voice}, so that each bot can have a different voice.
    // If the speech model is set to "system", system text-to-speech is used, which works on Windows and Mac; on Linux you need to `apt install espeak`.
    "chat_ingame": true, // bot responses are shown in minecraft chat
    "language": "en", // translate to/from this language. Supports these language names: https://cloud.google.com/translate/docs/languages
    "render_bot_view": false, // show bot's view in browser at localhost:3000, 3001...
--- a/src/agent/agent.js
+++ b/src/agent/agent.js
@ -383,9 +383,9 @@ export class Agent {
            }
        }
        else {
-	    if (settings.speak) {
+            if (settings.speak) {
-            say(to_translate);
+                say(to_translate, this.prompter.profile.speak_model);
-	    }
+            }
            if (settings.chat_ingame) {this.bot.chat(message);}
            sendOutputToServer(this.name, message);
        }
--- a/src/agent/speak.js
+++ b/src/agent/speak.js
@ -1,43 +1,107 @@
-import { exec } from 'child_process';
+import { exec, spawn } from 'child_process';
 import { TTSConfig as gptTTSConfig } from '../models/gpt.js';
 import { TTSConfig as geminiTTSConfig } from '../models/gemini.js';
 let speakingQueue = [];
 let isSpeaking = false;
-export function say(textToSpeak) {
+export function say(text, speak_model) {
-  speakingQueue.push(textToSpeak);
+  speakingQueue.push([text, speak_model]);
-  if (!isSpeaking) {
+  if (!isSpeaking) processQueue();
    processQueue();
  }
 }
-function processQueue() {
+async function processQueue() {
  if (speakingQueue.length === 0) {
    isSpeaking = false;
    return;
  }
  isSpeaking = true;
-  const textToSpeak = speakingQueue.shift();
+  const [txt, speak_model] = speakingQueue.shift();
  const isWin = process.platform === "win32";
  const isMac = process.platform === "darwin";
-  let command;
+  const isWin = process.platform === 'win32';
  const isMac = process.platform === 'darwin';
  const model = speak_model || 'openai/tts-1/echo';
  if (model === 'system') {
    // system TTS
    const cmd = isWin
      ? `powershell -NoProfile -Command "Add-Type -AssemblyName System.Speech; \
 $s=New-Object System.Speech.Synthesis.SpeechSynthesizer; $s.Rate=2; \
 $s.Speak('${txt.replace(/'/g,"''")}'); $s.Dispose()"`
      : isMac
      ? `say "${txt.replace(/"/g,'\\"')}"`
      : `espeak "${txt.replace(/"/g,'\\"')}"`;
    exec(cmd, err => {
      if (err) console.error('TTS error', err);
      processQueue();
    });
  if (isWin) {
    command = `powershell -Command "Add-Type -AssemblyName System.Speech; $s = New-Object System.Speech.Synthesis.SpeechSynthesizer; $s.Rate = 2; $s.Speak(\\"${textToSpeak}\\"); $s.Dispose()"`;
  } else if (isMac) {
    command = `say "${textToSpeak}"`;
  } else {
    command = `espeak "${textToSpeak}"`;
  }
-  exec(command, (error, stdout, stderr) => {
+    function getModelUrl(prov) {
-    if (error) {
+      if (prov === 'openai') {
-      console.error(`Error: ${error.message}`);
+        return gptTTSConfig.baseUrl;
-      console.error(`${error.stack}`);
+      } else if (prov === 'google') {
-    } else if (stderr) {
+        return geminiTTSConfig.baseUrl;
-      console.error(`Error: ${stderr}`);
+      } else {
        // fallback
        return 'https://api.openai.com/v1'
      }
    }
-    processQueue(); // Continue with the next message in the queue
+
-  });
+    // remote audio provider
    let prov, mdl, voice, url;
    if (typeof model === "string") {
      [prov, mdl, voice] = model.split('/');
      url = getModelUrl(prov);
    } else {
      prov = model.api;
      mdl = model.model;
      voice = model.voice;
      url = model.url || getModelUrl(prov);
    }
    try {
      let audioData;
      if (prov === "openai") {
        audioData = await gptTTSConfig.sendAudioRequest(txt, mdl, voice, url);
      } else if (prov === "google") {
        audioData = await geminiTTSConfig.sendAudioRequest(txt, mdl, voice, url);
      } else {
        throw new Error(`TTS Provider ${prov} is not supported.`);
      }
      if (!audioData) {
        throw new Error("TTS model did not return audio data");
        // will be handled below
      }
      if (isWin) {
        const ps = `
          Add-Type -AssemblyName presentationCore;
          $p=New-Object System.Windows.Media.MediaPlayer;
          $p.Open([Uri]::new("data:audio/mp3;base64,${audioData}"));
          $p.Play();
          Start-Sleep -Seconds [math]::Ceiling($p.NaturalDuration.TimeSpan.TotalSeconds);
        `;
        spawn('powershell', ['-NoProfile','-Command', ps], {
          stdio: 'ignore', detached: true
        }).unref();
        processQueue();
      } else {
        const player = spawn('ffplay', ['-nodisp','-autoexit','pipe:0'], {
          stdio: ['pipe','ignore','ignore']
        });
        player.stdin.write(Buffer.from(audioData, 'base64'));
        player.stdin.end();
        player.on('exit', processQueue);
      }
    } catch (e) {
      console.error('[TTS] Audio error', e);
      processQueue();
    }
  }
 }
--- a/src/models/gemini.js
+++ b/src/models/gemini.js
@ -1,13 +1,15 @@
-import { GoogleGenerativeAI } from '@google/generative-ai';
+import { GoogleGenAI } from '@google/genai';
-import { toSinglePrompt, strictFormat } from '../utils/text.js';
+import { strictFormat } from '../utils/text.js';
 import { getKey } from '../utils/keys.js';
 import { lamejs } from 'lamejs/lame.all.js';
 export class Gemini {
    static prefix = 'google';
    constructor(model_name, url, params) {
        this.model_name = model_name;
        this.params = params;
        this.url = url;
        this.safetySettings = [
            {
                "category": "HARM_CATEGORY_DANGEROUS",
@ -31,31 +33,12 @@ export class Gemini {
            },
        ];
-        this.genAI = new GoogleGenerativeAI(getKey('GEMINI_API_KEY'));
+        this.genAI = new GoogleGenAI({apiKey: getKey('GEMINI_API_KEY')});
    }
    async sendRequest(turns, systemMessage) {
        let model;
        const modelConfig = {
            model: this.model_name || "gemini-2.5-flash",
            // systemInstruction does not work bc google is trash
        };
        if (this.url) {
            model = this.genAI.getGenerativeModel(
                modelConfig,
                { baseUrl: this.url },
                { safetySettings: this.safetySettings }
            );
        } else {
            model = this.genAI.getGenerativeModel(
                modelConfig,
                { safetySettings: this.safetySettings }
            );
        }
        console.log('Awaiting Google API response...');
        turns.unshift({ role: 'system', content: systemMessage });
        turns = strictFormat(turns);
        let contents = [];
        for (let turn of turns) {
@ -65,72 +48,58 @@ export class Gemini {
            });
        }
-        const result = await model.generateContent({
+        const result = await this.genAI.models.generateContent({
-            contents,
+            model: this.model_name || "gemini-2.5-flash",
-            generationConfig: {
+            contents: contents,
            safetySettings: this.safetySettings,
            config: {
                systemInstruction: systemMessage,
                ...(this.params || {})
            }
        });
-        const response = await result.response;
+        const response = await result.text;
        let text;
        // Handle "thinking" models since they smart 
        if (this.model_name && this.model_name.includes("thinking")) {
            if (
                response.candidates &&
                response.candidates.length > 0 &&
                response.candidates[0].content &&
                response.candidates[0].content.parts &&
                response.candidates[0].content.parts.length > 1
            ) {
                text = response.candidates[0].content.parts[1].text;
            } else {
                console.warn("Unexpected response structure for thinking model:", response);
                text = response.text();
            }
        } else {
            text = response.text();
        }
        console.log('Received.');
-        return text;
+        return response;
    }
    async sendVisionRequest(turns, systemMessage, imageBuffer) {
        let model;
        if (this.url) {
            model = this.genAI.getGenerativeModel(
                { model: this.model_name || "gemini-1.5-flash" },
                { baseUrl: this.url },
                { safetySettings: this.safetySettings }
            );
        } else {
            model = this.genAI.getGenerativeModel(
                { model: this.model_name || "gemini-1.5-flash" },
                { safetySettings: this.safetySettings }
            );
        }
        const imagePart = {
            inlineData: {
                data: imageBuffer.toString('base64'),
                mimeType: 'image/jpeg'
            }
        };
        turns = strictFormat(turns);
        let contents = [];
        for (let turn of turns) {
            contents.push({
                role: turn.role === 'assistant' ? 'model' : 'user',
                parts: [{ text: turn.content }]
            });
        }
        contents.push({
            role: 'user',
            parts: [{ text: 'SYSTEM: Vision response' }, imagePart]
        })
        const stop_seq = '***';
        const prompt = toSinglePrompt(turns, systemMessage, stop_seq, 'model');
        let res = null;
        try {
            console.log('Awaiting Google API vision response...');
-            const result = await model.generateContent([prompt, imagePart]);
+            const result = await this.genAI.models.generateContent({
-            const response = await result.response;
+                contents: contents,
-            const text = response.text();
+                safetySettings: this.safetySettings,
                systemInstruction: systemMessage,
                model: this.model,
                config: {
                    systemInstruction: systemMessage,
                    ...(this.params || {})
                }
            });
            res = await result.text;
            console.log('Received.');
            if (!text.includes(stop_seq)) return text;
            const idx = text.indexOf(stop_seq);
            res = text.slice(0, idx);
        } catch (err) {
            console.log(err);
            if (err.message.includes("Image input modality is not enabled for models/")) {
@ -143,19 +112,63 @@ export class Gemini {
    }
    async embed(text) {
-        let model = this.model_name || "text-embedding-004";
+        const result = await this.genAI.models.embedContent({
-        if (this.url) {
+            model: this.model_name || "gemini-embedding-001",
-            model = this.genAI.getGenerativeModel(
+            contents: text,
-                { model },
+        })
                { baseUrl: this.url }
            );
        } else {
            model = this.genAI.getGenerativeModel(
                { model }
            );
        }
-        const result = await model.embedContent(text);
+        return result.embeddings;
        return result.embedding.values;
    }
 }
 /**
  * Ask a Gemini model to speak `text` and return the audio as base64-encoded MP3.
  *
  * The response's inline audio payload is treated as base64-encoded 16-bit PCM
  * (mono, 24 kHz per the constants below) and is re-encoded to MP3 with lamejs.
  *
  * @param {string} text - Text to synthesize.
  * @param {string} model - Model name passed to `generateContent`.
  * @param {string} voice - Prebuilt voice name placed in the speech config.
  * @param {string} [url] - Accepted so the signature matches the other TTS helper; not used here.
  * @returns {Promise<string|null>} Base64 MP3 data, or `null` when the response
  *     contains no inline audio data.
  */
 const sendAudioRequest = async (text, model, voice, url) => {
    const ai = new GoogleGenAI({apiKey: getKey('GEMINI_API_KEY')});

    const response = await ai.models.generateContent({
        model: model,
        contents: [{ parts: [{text: text}] }],
        config: {
            responseModalities: ['AUDIO'],
            speechConfig: {
                voiceConfig: {
                    prebuiltVoiceConfig: { voiceName: voice },
                },
            },
        },
    });

    const data = response.candidates?.[0]?.content?.parts?.[0]?.inlineData?.data;
    if (!data) {
        // Nothing to convert: hand back a falsy value rather than letting
        // Buffer.from(undefined) throw an unrelated TypeError.
        return null;
    }

    // data is base64-encoded pcm; convert pcm to mp3
    const SAMPLE_RATE = 24000;
    const CHANNELS = 1;
    const pcmBuffer = Buffer.from(data, 'base64');
    // View the bytes as 16-bit samples without copying; a trailing odd byte is dropped.
    const pcmInt16Array = new Int16Array(
        pcmBuffer.buffer,
        pcmBuffer.byteOffset,
        Math.floor(pcmBuffer.length / 2)
    );

    const mp3encoder = new lamejs.Mp3Encoder(CHANNELS, SAMPLE_RATE, 128);
    const sampleBlockSize = 1152; // Standard for MPEG audio
    const mp3Data = [];

    for (let i = 0; i < pcmInt16Array.length; i += sampleBlockSize) {
        const sampleChunk = pcmInt16Array.subarray(i, i + sampleBlockSize);
        const encoded = mp3encoder.encodeBuffer(sampleChunk);
        if (encoded.length > 0) {
            mp3Data.push(Buffer.from(encoded));
        }
    }

    // Flush whatever the encoder is still holding after the last chunk.
    const tail = mp3encoder.flush();
    if (tail.length > 0) {
        mp3Data.push(Buffer.from(tail));
    }

    return Buffer.concat(mp3Data).toString('base64');
 }
 /**
  * Text-to-speech entry points for the Google provider.
  *
  * `baseUrl` mirrors the field of the same name on the OpenAI TTS config so
  * that code reading `TTSConfig.baseUrl` for either provider gets a string.
  */
 export const TTSConfig = {
    sendAudioRequest: sendAudioRequest,
    baseUrl: 'https://generativelanguage.googleapis.com',
 }
--- a/src/models/gpt.js
+++ b/src/models/gpt.js
@ -90,3 +90,35 @@ export class GPT {
    }
 }
 /**
  * Request synthesized speech through the OpenAI client and return it base64-encoded.
  *
  * @param {string} text - Text to speak; sent as `input`.
  * @param {string} model - TTS model name; sent as `model`.
  * @param {string} voice - Voice name; sent as `voice`.
  * @param {string} [url] - Base URL for the client; left out of the client config when falsy.
  * @returns {Promise<string>} Base64 encoding of the bytes returned by the speech request.
  */
 const sendAudioRequest = async (text, model, voice, url) => {
    const payload = {
        model: model,
        voice: voice,
        input: text
    };

    // Build the client config in the same order as before: URL, optional org, then key.
    const config = {};
    if (url)
        config.baseURL = url;
    if (hasKey('OPENAI_ORG_ID'))
        config.organization = getKey('OPENAI_ORG_ID');
    config.apiKey = getKey('OPENAI_API_KEY');

    const openai = new OpenAIApi(config);
    const speech = await openai.audio.speech.create(payload);
    const bytes = Buffer.from(await speech.arrayBuffer());
    return bytes.toString("base64");
 }
 /**
  * Text-to-speech entry points for the OpenAI provider: the request helper
  * and the base URL used when none is supplied.
  */
 export const TTSConfig = {
    sendAudioRequest,
    baseUrl: 'https://api.openai.com/v1',
 };