mirror of
https://github.com/kolbytn/mindcraft.git
synced 2025-08-28 18:03:03 +02:00
Merge pull request #599 from uukelele-scratch/pollinations-support
TTS for OpenAI + Gemini, Update Gemini SDK
This commit is contained in:
commit
d489cae49d
9 changed files with 260 additions and 115 deletions
|
@ -143,6 +143,12 @@ You can pass a string or an object for these fields. A model object must specify
|
||||||
"api": "openai",
|
"api": "openai",
|
||||||
"url": "https://api.openai.com/v1/",
|
"url": "https://api.openai.com/v1/",
|
||||||
"model": "text-embedding-ada-002"
|
"model": "text-embedding-ada-002"
|
||||||
|
},
|
||||||
|
"speak_model": {
|
||||||
|
"api": "openai",
|
||||||
|
"url": "https://api.openai.com/v1/",
|
||||||
|
"model": "tts-1",
|
||||||
|
"voice": "echo"
|
||||||
}
|
}
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
|
@ -2,8 +2,8 @@
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@anthropic-ai/sdk": "^0.17.1",
|
"@anthropic-ai/sdk": "^0.17.1",
|
||||||
|
"@google/genai": "^1.15.0",
|
||||||
"@cerebras/cerebras_cloud_sdk": "^1.46.0",
|
"@cerebras/cerebras_cloud_sdk": "^1.46.0",
|
||||||
"@google/generative-ai": "^0.2.1",
|
|
||||||
"@huggingface/inference": "^2.8.1",
|
"@huggingface/inference": "^2.8.1",
|
||||||
"@mistralai/mistralai": "^1.1.0",
|
"@mistralai/mistralai": "^1.1.0",
|
||||||
"canvas": "^3.1.0",
|
"canvas": "^3.1.0",
|
||||||
|
@ -11,6 +11,8 @@
|
||||||
"express": "^4.18.2",
|
"express": "^4.18.2",
|
||||||
"google-translate-api-x": "^10.7.1",
|
"google-translate-api-x": "^10.7.1",
|
||||||
"groq-sdk": "^0.15.0",
|
"groq-sdk": "^0.15.0",
|
||||||
|
"install": "^0.13.0",
|
||||||
|
"lamejs": "^1.2.1",
|
||||||
"minecraft-data": "^3.78.0",
|
"minecraft-data": "^3.78.0",
|
||||||
"mineflayer": "^4.29.0",
|
"mineflayer": "^4.29.0",
|
||||||
"mineflayer-armor-manager": "^2.0.1",
|
"mineflayer-armor-manager": "^2.0.1",
|
||||||
|
@ -19,8 +21,8 @@
|
||||||
"mineflayer-pathfinder": "^2.4.5",
|
"mineflayer-pathfinder": "^2.4.5",
|
||||||
"mineflayer-pvp": "^1.3.2",
|
"mineflayer-pvp": "^1.3.2",
|
||||||
"node-canvas-webgl": "PrismarineJS/node-canvas-webgl",
|
"node-canvas-webgl": "PrismarineJS/node-canvas-webgl",
|
||||||
|
"npm": "^11.5.2",
|
||||||
"openai": "^4.4.0",
|
"openai": "^4.4.0",
|
||||||
"patch-package": "^8.0.0",
|
|
||||||
"prismarine-item": "^1.15.0",
|
"prismarine-item": "^1.15.0",
|
||||||
"prismarine-viewer": "^1.32.0",
|
"prismarine-viewer": "^1.32.0",
|
||||||
"replicate": "^0.29.4",
|
"replicate": "^0.29.4",
|
||||||
|
@ -39,6 +41,7 @@
|
||||||
"@eslint/js": "^9.13.0",
|
"@eslint/js": "^9.13.0",
|
||||||
"eslint": "^9.13.0",
|
"eslint": "^9.13.0",
|
||||||
"eslint-plugin-no-floating-promise": "^2.0.0",
|
"eslint-plugin-no-floating-promise": "^2.0.0",
|
||||||
"globals": "^15.11.0"
|
"globals": "^15.11.0",
|
||||||
|
"patch-package": "^8.0.0"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
21
patches/lamejs+1.2.1.patch
Normal file
21
patches/lamejs+1.2.1.patch
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
diff --git a/node_modules/lamejs/lame.all.js b/node_modules/lamejs/lame.all.js
|
||||||
|
index bfd3637..b905508 100644
|
||||||
|
--- a/node_modules/lamejs/lame.all.js
|
||||||
|
+++ b/node_modules/lamejs/lame.all.js
|
||||||
|
@@ -1,4 +1,3 @@
|
||||||
|
-function lamejs() {
|
||||||
|
function new_byte(count) {
|
||||||
|
return new Int8Array(count);
|
||||||
|
}
|
||||||
|
@@ -15511,8 +15510,9 @@ WavHeader.readHeader = function (dataView) {
|
||||||
|
|
||||||
|
L3Side.SFBMAX = (Encoder.SBMAX_s * 3);
|
||||||
|
//testFullLength();
|
||||||
|
+export var lamejs = {}
|
||||||
|
lamejs.Mp3Encoder = Mp3Encoder;
|
||||||
|
lamejs.WavHeader = WavHeader;
|
||||||
|
-}
|
||||||
|
+
|
||||||
|
//fs=require('fs');
|
||||||
|
-lamejs();
|
||||||
|
+//lamejs();
|
|
@ -11,6 +11,8 @@
|
||||||
|
|
||||||
"image_analysis": "You are a Minecraft bot named $NAME that has been given a screenshot of your current view. Analyze and summarize the view; describe terrain, blocks, entities, structures, and notable features. Focus on details relevant to the conversation. Note: the sky is always blue regardless of weather or time, dropped items are small pink cubes, and blocks below y=0 do not render. Be extremely concise and correct, respond only with your analysis, not conversationally. $STATS",
|
"image_analysis": "You are a Minecraft bot named $NAME that has been given a screenshot of your current view. Analyze and summarize the view; describe terrain, blocks, entities, structures, and notable features. Focus on details relevant to the conversation. Note: the sky is always blue regardless of weather or time, dropped items are small pink cubes, and blocks below y=0 do not render. Be extremely concise and correct, respond only with your analysis, not conversationally. $STATS",
|
||||||
|
|
||||||
|
"speak_model": "openai/tts-1/echo",
|
||||||
|
|
||||||
"modes": {
|
"modes": {
|
||||||
"self_preservation": true,
|
"self_preservation": true,
|
||||||
"unstuck": true,
|
"unstuck": true,
|
||||||
|
|
|
@ -28,7 +28,11 @@ const settings = {
|
||||||
"load_memory": false, // load memory from previous session
|
"load_memory": false, // load memory from previous session
|
||||||
"init_message": "Respond with hello world and your name", // sends to all on spawn
|
"init_message": "Respond with hello world and your name", // sends to all on spawn
|
||||||
"only_chat_with": [], // users that the bots listen to and send general messages to. if empty it will chat publicly
|
"only_chat_with": [], // users that the bots listen to and send general messages to. if empty it will chat publicly
|
||||||
"speak": false, // allows all bots to speak through system text-to-speech. works on windows, mac, on linux you need to `apt install espeak`
|
|
||||||
|
"speak": true,
|
||||||
|
// allows all bots to speak through text-to-speech. the voice is chosen by "speak_model" in each bot's profile, format: {provider}/{model}/{voice}. if "speak_model" is set to "system" it will use system text-to-speech, which works on windows and mac, but on linux you need to `apt install espeak`.
|
||||||
|
// specify speech model inside each profile - so that you can have each bot with different voices
|
||||||
|
|
||||||
"chat_ingame": true, // bot responses are shown in minecraft chat
|
"chat_ingame": true, // bot responses are shown in minecraft chat
|
||||||
"language": "en", // translate to/from this language. Supports these language names: https://cloud.google.com/translate/docs/languages
|
"language": "en", // translate to/from this language. Supports these language names: https://cloud.google.com/translate/docs/languages
|
||||||
"render_bot_view": false, // show bot's view in browser at localhost:3000, 3001...
|
"render_bot_view": false, // show bot's view in browser at localhost:3000, 3001...
|
||||||
|
|
|
@ -383,9 +383,9 @@ export class Agent {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
if (settings.speak) {
|
if (settings.speak) {
|
||||||
say(to_translate);
|
say(to_translate, this.prompter.profile.speak_model);
|
||||||
}
|
}
|
||||||
if (settings.chat_ingame) {this.bot.chat(message);}
|
if (settings.chat_ingame) {this.bot.chat(message);}
|
||||||
sendOutputToServer(this.name, message);
|
sendOutputToServer(this.name, message);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,43 +1,107 @@
|
||||||
import { exec } from 'child_process';
|
import { exec, spawn } from 'child_process';
|
||||||
|
import { TTSConfig as gptTTSConfig } from '../models/gpt.js';
|
||||||
|
import { TTSConfig as geminiTTSConfig } from '../models/gemini.js';
|
||||||
|
|
||||||
let speakingQueue = [];
|
let speakingQueue = [];
|
||||||
let isSpeaking = false;
|
let isSpeaking = false;
|
||||||
|
|
||||||
/**
 * Queue a message for text-to-speech playback and start the queue worker if
 * it is not already running.
 *
 * @param {string} text - message to speak.
 * @param {string|object} [speak_model] - TTS selection stored with the text and
 *   handed to processQueue: a "provider/model/voice" string, "system", or an
 *   object with `api`, `model`, `voice` and optional `url`.
 */
export function say(text, speak_model) {
    speakingQueue.push([text, speak_model]);
    if (isSpeaking) return;

    // processQueue is async: attach a handler so a failure is logged instead of
    // becoming an unhandled rejection, and release the busy flag so that the
    // next say() call can restart the worker.
    processQueue().catch((err) => {
        console.error('[TTS] Queue error', err);
        isSpeaking = false;
    });
}
|
||||||
|
|
||||||
/**
 * Resolve the default API base URL for a remote TTS provider.
 *
 * @param {string} prov - provider id taken from the speak model.
 * @returns {string|undefined} the `baseUrl` of the provider's TTSConfig
 *   (undefined if that config does not define one); unknown providers fall
 *   back to the OpenAI URL.
 */
function getModelUrl(prov) {
    if (prov === 'openai') return gptTTSConfig.baseUrl;
    if (prov === 'google') return geminiTTSConfig.baseUrl;
    // fallback
    return 'https://api.openai.com/v1';
}

/**
 * Split a speak model into its parts.
 *
 * @param {string|object} model - "provider/model/voice" string, or an object
 *   with `api`, `model`, `voice` and optional `url`.
 * @returns {{prov: string, mdl: string, voice: string, url: (string|undefined)}}
 */
function parseSpeakModel(model) {
    if (typeof model === 'string') {
        const [prov, mdl, voice] = model.split('/');
        return { prov, mdl, voice, url: getModelUrl(prov) };
    }
    return {
        prov: model.api,
        mdl: model.model,
        voice: model.voice,
        url: model.url || getModelUrl(model.api),
    };
}

/**
 * Speak text with the operating system's TTS (PowerShell System.Speech on
 * Windows, `say` on macOS, `espeak` elsewhere) and call `done` when the
 * speaking process has finished or failed.
 *
 * The text never passes through a shell: it is handed over as a single argv
 * entry (after `--`, so a leading "-" is not parsed as an option) or, on
 * Windows, through an environment variable read by the script. Quotes,
 * backticks or `$()` in the text therefore cannot be interpreted as commands.
 *
 * @param {string} txt - text to speak.
 * @param {Function} done - called once playback ended or failed.
 */
function speakWithSystemVoice(txt, done) {
    let command;
    let args;
    const options = { stdio: 'ignore' };

    if (process.platform === 'win32') {
        command = 'powershell';
        const script = [
            'Add-Type -AssemblyName System.Speech;',
            '$s=New-Object System.Speech.Synthesis.SpeechSynthesizer;',
            '$s.Rate=2;',
            '$s.Speak($env:MINDCRAFT_TTS_TEXT);',
            '$s.Dispose()',
        ].join(' ');
        args = ['-NoProfile', '-Command', script];
        options.env = { ...process.env, MINDCRAFT_TTS_TEXT: txt };
    } else {
        command = process.platform === 'darwin' ? 'say' : 'espeak';
        args = ['--', txt];
    }

    // 'error' (e.g. binary not installed) and 'close' can both fire for the
    // same child, so report and continue only once.
    let settled = false;
    const settle = (err) => {
        if (settled) return;
        settled = true;
        if (err) console.error('TTS error', err);
        done();
    };

    const child = spawn(command, args, options);
    child.on('error', settle);
    child.on('close', (code) => {
        settle(code === 0 ? null : new Error(`${command} exited with code ${code}`));
    });
}

/**
 * Play base64-encoded MP3 audio through an external player and call `done`
 * when the queue may continue.
 *
 * @param {string} audioData - base64-encoded MP3 data.
 * @param {Function} done - called when the next queue item may start.
 */
function playAudio(audioData, done) {
    if (process.platform === 'win32') {
        // The cmdlet argument must be parenthesised: in PowerShell argument mode
        // a bare [math]::Ceiling(...) is passed as a string, not evaluated.
        const ps = `
            Add-Type -AssemblyName presentationCore;
            $p=New-Object System.Windows.Media.MediaPlayer;
            $p.Open([Uri]::new("data:audio/mp3;base64,${audioData}"));
            $p.Play();
            Start-Sleep -Seconds ([math]::Ceiling($p.NaturalDuration.TimeSpan.TotalSeconds));
        `;
        const child = spawn('powershell', ['-NoProfile', '-Command', ps], {
            stdio: 'ignore', detached: true
        });
        // Without a listener a failed spawn would raise an uncaught 'error' event.
        child.on('error', (err) => console.error('[TTS] Audio error', err));
        child.unref();
        // NOTE(review): playback is fire-and-forget here, so the queue advances
        // before the clip has finished, and the whole clip travels on the command
        // line, which presumably hits OS length limits for long clips - consider
        // a temp file plus waiting for the process to exit.
        done();
        return;
    }

    const player = spawn('ffplay', ['-nodisp', '-autoexit', 'pipe:0'], {
        stdio: ['pipe', 'ignore', 'ignore']
    });

    let settled = false;
    const settle = (err) => {
        if (settled) return;
        settled = true;
        if (err) console.error('[TTS] Audio error', err);
        done();
    };

    // A missing ffplay binary is reported through 'error', not by throwing.
    player.on('error', settle);
    player.on('exit', () => settle());
    // The pipe errors if the player dies before reading everything; log it
    // here and let 'exit'/'error' above advance the queue.
    player.stdin.on('error', (err) => console.error('[TTS] Audio error', err));
    player.stdin.end(Buffer.from(audioData, 'base64'));
}

/**
 * Speak the next queued message, then continue with the one after it.
 *
 * Takes one `[text, speak_model]` entry off `speakingQueue`. A model of
 * "system" uses the operating system's TTS; any other model is sent to the
 * remote provider it names ("openai" or "google") and the returned audio is
 * played. When the model is missing, 'openai/tts-1/echo' is used. Failures
 * are logged and the queue moves on; when the queue is empty `isSpeaking` is
 * cleared.
 *
 * @returns {Promise<void>}
 */
async function processQueue() {
    if (speakingQueue.length === 0) {
        isSpeaking = false;
        return;
    }

    isSpeaking = true;
    const [txt, speak_model] = speakingQueue.shift();
    const model = speak_model || 'openai/tts-1/echo';

    // Several completion paths (thrown error, process events) can report for
    // the same item; advance the queue exactly once per item.
    let advanced = false;
    const next = () => {
        if (advanced) return;
        advanced = true;
        processQueue().catch((err) => {
            console.error('[TTS] Queue error', err);
            isSpeaking = false;
        });
    };

    try {
        if (model === 'system') {
            speakWithSystemVoice(String(txt), next);
            return;
        }

        // remote audio provider
        const { prov, mdl, voice, url } = parseSpeakModel(model);

        let audioData;
        if (prov === 'openai') {
            audioData = await gptTTSConfig.sendAudioRequest(txt, mdl, voice, url);
        } else if (prov === 'google') {
            audioData = await geminiTTSConfig.sendAudioRequest(txt, mdl, voice, url);
        } else {
            throw new Error(`TTS Provider ${prov} is not supported.`);
        }

        if (!audioData) {
            throw new Error("TTS model did not return audio data");
        }

        playAudio(audioData, next);
    } catch (e) {
        console.error('[TTS] Audio error', e);
        next();
    }
}
|
||||||
|
|
|
@ -1,13 +1,15 @@
|
||||||
import { GoogleGenerativeAI } from '@google/generative-ai';
|
import { GoogleGenAI } from '@google/genai';
|
||||||
import { toSinglePrompt, strictFormat } from '../utils/text.js';
|
import { strictFormat } from '../utils/text.js';
|
||||||
import { getKey } from '../utils/keys.js';
|
import { getKey } from '../utils/keys.js';
|
||||||
|
|
||||||
|
import { lamejs } from 'lamejs/lame.all.js';
|
||||||
|
|
||||||
|
|
||||||
export class Gemini {
|
export class Gemini {
|
||||||
static prefix = 'google';
|
static prefix = 'google';
|
||||||
constructor(model_name, url, params) {
|
constructor(model_name, url, params) {
|
||||||
this.model_name = model_name;
|
this.model_name = model_name;
|
||||||
this.params = params;
|
this.params = params;
|
||||||
this.url = url;
|
|
||||||
this.safetySettings = [
|
this.safetySettings = [
|
||||||
{
|
{
|
||||||
"category": "HARM_CATEGORY_DANGEROUS",
|
"category": "HARM_CATEGORY_DANGEROUS",
|
||||||
|
@ -31,31 +33,12 @@ export class Gemini {
|
||||||
},
|
},
|
||||||
];
|
];
|
||||||
|
|
||||||
this.genAI = new GoogleGenerativeAI(getKey('GEMINI_API_KEY'));
|
this.genAI = new GoogleGenAI({apiKey: getKey('GEMINI_API_KEY')});
|
||||||
}
|
}
|
||||||
|
|
||||||
async sendRequest(turns, systemMessage) {
|
async sendRequest(turns, systemMessage) {
|
||||||
let model;
|
|
||||||
const modelConfig = {
|
|
||||||
model: this.model_name || "gemini-2.5-flash",
|
|
||||||
// systemInstruction does not work bc google is trash
|
|
||||||
};
|
|
||||||
if (this.url) {
|
|
||||||
model = this.genAI.getGenerativeModel(
|
|
||||||
modelConfig,
|
|
||||||
{ baseUrl: this.url },
|
|
||||||
{ safetySettings: this.safetySettings }
|
|
||||||
);
|
|
||||||
} else {
|
|
||||||
model = this.genAI.getGenerativeModel(
|
|
||||||
modelConfig,
|
|
||||||
{ safetySettings: this.safetySettings }
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
console.log('Awaiting Google API response...');
|
console.log('Awaiting Google API response...');
|
||||||
|
|
||||||
turns.unshift({ role: 'system', content: systemMessage });
|
|
||||||
turns = strictFormat(turns);
|
turns = strictFormat(turns);
|
||||||
let contents = [];
|
let contents = [];
|
||||||
for (let turn of turns) {
|
for (let turn of turns) {
|
||||||
|
@ -65,72 +48,58 @@ export class Gemini {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
const result = await model.generateContent({
|
const result = await this.genAI.models.generateContent({
|
||||||
contents,
|
model: this.model_name || "gemini-2.5-flash",
|
||||||
generationConfig: {
|
contents: contents,
|
||||||
|
safetySettings: this.safetySettings,
|
||||||
|
config: {
|
||||||
|
systemInstruction: systemMessage,
|
||||||
...(this.params || {})
|
...(this.params || {})
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
const response = await result.response;
|
const response = await result.text;
|
||||||
let text;
|
|
||||||
|
|
||||||
// Handle "thinking" models since they smart
|
|
||||||
if (this.model_name && this.model_name.includes("thinking")) {
|
|
||||||
if (
|
|
||||||
response.candidates &&
|
|
||||||
response.candidates.length > 0 &&
|
|
||||||
response.candidates[0].content &&
|
|
||||||
response.candidates[0].content.parts &&
|
|
||||||
response.candidates[0].content.parts.length > 1
|
|
||||||
) {
|
|
||||||
text = response.candidates[0].content.parts[1].text;
|
|
||||||
} else {
|
|
||||||
console.warn("Unexpected response structure for thinking model:", response);
|
|
||||||
text = response.text();
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
text = response.text();
|
|
||||||
}
|
|
||||||
|
|
||||||
console.log('Received.');
|
console.log('Received.');
|
||||||
|
|
||||||
return text;
|
return response;
|
||||||
}
|
}
|
||||||
|
|
||||||
async sendVisionRequest(turns, systemMessage, imageBuffer) {
|
async sendVisionRequest(turns, systemMessage, imageBuffer) {
|
||||||
let model;
|
|
||||||
if (this.url) {
|
|
||||||
model = this.genAI.getGenerativeModel(
|
|
||||||
{ model: this.model_name || "gemini-1.5-flash" },
|
|
||||||
{ baseUrl: this.url },
|
|
||||||
{ safetySettings: this.safetySettings }
|
|
||||||
);
|
|
||||||
} else {
|
|
||||||
model = this.genAI.getGenerativeModel(
|
|
||||||
{ model: this.model_name || "gemini-1.5-flash" },
|
|
||||||
{ safetySettings: this.safetySettings }
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
const imagePart = {
|
const imagePart = {
|
||||||
inlineData: {
|
inlineData: {
|
||||||
data: imageBuffer.toString('base64'),
|
data: imageBuffer.toString('base64'),
|
||||||
mimeType: 'image/jpeg'
|
mimeType: 'image/jpeg'
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
turns = strictFormat(turns);
|
||||||
|
let contents = [];
|
||||||
|
for (let turn of turns) {
|
||||||
|
contents.push({
|
||||||
|
role: turn.role === 'assistant' ? 'model' : 'user',
|
||||||
|
parts: [{ text: turn.content }]
|
||||||
|
});
|
||||||
|
}
|
||||||
|
contents.push({
|
||||||
|
role: 'user',
|
||||||
|
parts: [{ text: 'SYSTEM: Vision response' }, imagePart]
|
||||||
|
})
|
||||||
|
|
||||||
const stop_seq = '***';
|
|
||||||
const prompt = toSinglePrompt(turns, systemMessage, stop_seq, 'model');
|
|
||||||
let res = null;
|
let res = null;
|
||||||
try {
|
try {
|
||||||
console.log('Awaiting Google API vision response...');
|
console.log('Awaiting Google API vision response...');
|
||||||
const result = await model.generateContent([prompt, imagePart]);
|
const result = await this.genAI.models.generateContent({
|
||||||
const response = await result.response;
|
contents: contents,
|
||||||
const text = response.text();
|
safetySettings: this.safetySettings,
|
||||||
|
systemInstruction: systemMessage,
|
||||||
|
model: this.model,
|
||||||
|
config: {
|
||||||
|
systemInstruction: systemMessage,
|
||||||
|
...(this.params || {})
|
||||||
|
}
|
||||||
|
});
|
||||||
|
res = await result.text;
|
||||||
console.log('Received.');
|
console.log('Received.');
|
||||||
if (!text.includes(stop_seq)) return text;
|
|
||||||
const idx = text.indexOf(stop_seq);
|
|
||||||
res = text.slice(0, idx);
|
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
console.log(err);
|
console.log(err);
|
||||||
if (err.message.includes("Image input modality is not enabled for models/")) {
|
if (err.message.includes("Image input modality is not enabled for models/")) {
|
||||||
|
@ -143,19 +112,63 @@ export class Gemini {
|
||||||
}
|
}
|
||||||
|
|
||||||
async embed(text) {
|
async embed(text) {
|
||||||
let model = this.model_name || "text-embedding-004";
|
const result = await this.genAI.models.embedContent({
|
||||||
if (this.url) {
|
model: this.model_name || "gemini-embedding-001",
|
||||||
model = this.genAI.getGenerativeModel(
|
contents: text,
|
||||||
{ model },
|
})
|
||||||
{ baseUrl: this.url }
|
|
||||||
);
|
|
||||||
} else {
|
|
||||||
model = this.genAI.getGenerativeModel(
|
|
||||||
{ model }
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
const result = await model.embedContent(text);
|
return result.embeddings;
|
||||||
return result.embedding.values;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Encode raw PCM audio, read as 16-bit samples, to MP3 with lamejs at 128 kbps.
 *
 * @param {Buffer} pcmBuffer - raw PCM bytes.
 * @param {number} sampleRate - sample rate passed to the encoder.
 * @param {number} channels - channel count passed to the encoder.
 * @returns {Buffer} the encoded MP3 data.
 */
function encodePcmAsMp3(pcmBuffer, sampleRate, channels) {
    // View the bytes as 16-bit samples without copying; a trailing odd byte is ignored.
    const samples = new Int16Array(
        pcmBuffer.buffer,
        pcmBuffer.byteOffset,
        Math.floor(pcmBuffer.length / 2)
    );
    const encoder = new lamejs.Mp3Encoder(channels, sampleRate, 128);
    const sampleBlockSize = 1152; // Standard for MPEG audio
    const chunks = [];

    for (let i = 0; i < samples.length; i += sampleBlockSize) {
        const encoded = encoder.encodeBuffer(samples.subarray(i, i + sampleBlockSize));
        if (encoded.length > 0) {
            chunks.push(Buffer.from(encoded));
        }
    }

    // Flush whatever the encoder still holds after the last block.
    const tail = encoder.flush();
    if (tail.length > 0) {
        chunks.push(Buffer.from(tail));
    }

    return Buffer.concat(chunks);
}

/**
 * Request speech audio for `text` from a Gemini model and return it as MP3.
 *
 * The inline audio data of the response is treated as base64-encoded PCM
 * (mono, 24 kHz) and converted to MP3.
 *
 * @param {string} text - text to speak.
 * @param {string} model - Gemini model name.
 * @param {string} voice - prebuilt voice name.
 * @param {string} [url] - not used by this implementation; it is part of the
 *   signature callers use for every TTS provider.
 * @returns {Promise<string|null>} base64-encoded MP3 audio, or null when the
 *   response carries no inline audio data.
 */
const sendAudioRequest = async (text, model, voice, url) => {
    const ai = new GoogleGenAI({apiKey: getKey('GEMINI_API_KEY')});

    const response = await ai.models.generateContent({
        model,
        contents: [{ parts: [{ text }] }],
        config: {
            responseModalities: ['AUDIO'],
            speechConfig: {
                voiceConfig: {
                    prebuiltVoiceConfig: { voiceName: voice },
                },
            },
        },
    });

    // Take the first part that actually carries inline data instead of assuming
    // it is parts[0]; without any audio report "no data" to the caller rather
    // than failing inside Buffer.from(undefined).
    const parts = response.candidates?.[0]?.content?.parts ?? [];
    const data = parts.find((part) => part?.inlineData?.data)?.inlineData.data;
    if (!data) {
        return null;
    }

    // data is base64-encoded pcm; convert pcm to mp3
    const SAMPLE_RATE = 24000;
    const CHANNELS = 1;
    const mp3 = encodePcmAsMp3(Buffer.from(data, 'base64'), SAMPLE_RATE, CHANNELS);

    return mp3.toString('base64');
}
|
||||||
|
|
||||||
|
/**
 * Text-to-speech hooks for the Google provider.
 *
 * - sendAudioRequest(text, model, voice, url): resolves to base64-encoded MP3 audio.
 * - baseUrl: default API endpoint reported for this provider, so that
 *   `TTSConfig.baseUrl` is defined in the same way as for other providers.
 */
export const TTSConfig = {
    sendAudioRequest: sendAudioRequest,
    baseUrl: 'https://generativelanguage.googleapis.com',
}
|
|
@ -90,3 +90,35 @@ export class GPT {
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Request speech audio for `text` from the OpenAI speech endpoint.
 *
 * @param {string} text - text to speak.
 * @param {string} model - TTS model name.
 * @param {string} voice - voice name.
 * @param {string} [url] - API base URL; when empty the client's default is used.
 * @returns {Promise<string>} the audio returned by the API, base64-encoded.
 */
const sendAudioRequest = async (text, model, voice, url) => {
    const payload = { model, voice, input: text };

    const config = {};
    if (url) {
        config.baseURL = url;
    }
    if (hasKey('OPENAI_ORG_ID')) {
        config.organization = getKey('OPENAI_ORG_ID');
    }
    config.apiKey = getKey('OPENAI_API_KEY');

    const openai = new OpenAIApi(config);

    const mp3 = await openai.audio.speech.create(payload);
    const buffer = Buffer.from(await mp3.arrayBuffer());
    return buffer.toString("base64");
}
|
||||||
|
|
||||||
|
/**
 * Text-to-speech hooks for the OpenAI provider: the request function and the
 * default API base URL.
 */
export const TTSConfig = {
    sendAudioRequest,
    baseUrl: 'https://api.openai.com/v1',
};
|
||||||
|
|
Loading…
Add table
Reference in a new issue