From ba586127422ae5f369687189c9c115962334d144 Mon Sep 17 00:00:00 2001 From: Rishabh Bhargava Date: Mon, 27 Apr 2026 00:00:05 -0700 Subject: [PATCH 1/2] [MLE-5159] docs(audio-ws): correct response format from "WAV" to "Raw PCM" The Together WS endpoint streams raw PCM s16le samples with no RIFF/WAVE header, base64-wrapped per audio_output.delta event. The previous "WAV (PCM s16le)" claim led developers to write the bytes to a .wav file and find that no player accepts them (afplay, QuickTime, VLC all reject the file because there is no WAV magic). Updates the audio format description and the two code samples (Python, Node.js) to save to .pcm rather than .wav, matching the actual on-the-wire format. Co-Authored-By: Claude Opus 4.7 (1M context) --- openapi.yaml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/openapi.yaml b/openapi.yaml index 0b20862..0ab6e72 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -3866,9 +3866,9 @@ paths: - The `input_text_buffer.commit` event is received **Audio Format:** - - Format: WAV (PCM s16le) + - Format: Raw PCM (s16le, mono) - Sample Rate: 24000 Hz - - Encoding: Base64 + - Encoding: Base64 (per delta event) - Delivered via `conversation.item.audio_output.delta` events **Error Codes:** @@ -3941,10 +3941,10 @@ paths: print(f"Error: {error.get('message')}") break - # Save the audio to a file - with open("output.wav", "wb") as f: + # Save the raw PCM samples to a file + with open("output.pcm", "wb") as f: f.write(audio_data) - print("Audio saved to output.wav") + print("Audio saved to output.pcm") # Run send and receive concurrently await asyncio.gather(send_text(), receive_audio()) @@ -4017,11 +4017,11 @@ paths: }); ws.on('close', () => { - // Save the audio to a file + // Save the raw PCM samples to a file if (audioData.length > 0) { const completeAudio = Buffer.concat(audioData); - fs.writeFileSync('output.wav', completeAudio); - console.log('Audio saved to output.wav'); + fs.writeFileSync('output.pcm', completeAudio); + console.log('Audio saved to output.pcm'); } }); From aaf69d22b1181ed6ab0d289475ee940af1bf7f3c Mon Sep 17 00:00:00 2001 From: Rishabh Bhargava Date: Mon, 27 Apr 2026 00:08:11 -0700 Subject: [PATCH 2/2] [MLE-5159] docs(audio-ws): fix voice in samples + guard session-failed event MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The voice 'tara' belongs to Orpheus, not Kokoro. Kokoro's default voice 'af_heart' is the popular choice and exists in the catalog. Running the sample as written produced an immediate conversation.item.tts.failed (Voice 'tara' is not available for model 'hexgrad/Kokoro-82M'). The Python sample compounded that with an unconditional session_data['session']['id'] access on the first message — when the first message is tts.failed instead of session.created, that crashes with KeyError before any code can react. Added a guard so the sample fails gracefully with the actual error message. JS sample already gated on message.type === 'session.created' so no event-handling change is needed there. Verified end-to-end: with the fixes applied, the sample now writes 257012 bytes (≈ 5.35 s of raw PCM s16le @ 24 kHz mono) to output.pcm. ffmpeg wraps it cleanly and afplay plays it. Co-Authored-By: Claude Opus 4.7 (1M context) --- openapi.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/openapi.yaml b/openapi.yaml index 0ab6e72..b1d0958 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -3890,7 +3890,7 @@ paths: async def generate_speech(): api_key = os.environ.get("TOGETHER_API_KEY") - url = "wss://api.together.ai/v1/audio/speech/websocket?model=hexgrad/Kokoro-82M&voice=tara" + url = "wss://api.together.ai/v1/audio/speech/websocket?model=hexgrad/Kokoro-82M&voice=af_heart" headers = { "Authorization": f"Bearer {api_key}" @@ -3900,6 +3900,9 @@ paths: # Wait for session created session_msg = await ws.recv() session_data = json.loads(session_msg) + if session_data.get("type") != "session.created": + print(f"Failed to start session: {session_data}") + return print(f"Session created: {session_data['session']['id']}") # Send text for TTS @@ -3957,7 +3960,7 @@ paths: import fs from 'fs'; const apiKey = process.env.TOGETHER_API_KEY; - const url = 'wss://api.together.ai/v1/audio/speech/websocket?model=hexgrad/Kokoro-82M&voice=tara'; + const url = 'wss://api.together.ai/v1/audio/speech/websocket?model=hexgrad/Kokoro-82M&voice=af_heart'; const ws = new WebSocket(url, { headers: {