Adds STT functionality with audio upload

Adds speech-to-text (STT) functionality by allowing users to upload audio clips, start transcription jobs, and download transcriptions.

Introduces new API endpoints for STT upload, start, and download.

Also adds a helper that converts an AudioClip into a 16-bit PCM WAV byte array.
This commit is contained in:
Jonas Luz Jr. 2025-11-26 19:31:12 -03:00
parent ea4535ebb6
commit e0daa00205
6 changed files with 447 additions and 47 deletions

View File

@ -1,6 +1,6 @@
using System;
using System.Collections;
using System.IO;
using System.Collections;
using System.Text;
using UnityEngine;
@ -56,12 +56,6 @@ namespace PPGIA.X540.Project3.API
request.downloadHandler = new DownloadHandlerBuffer();
// Debug.Log($"Sending {method} request to {url}");
// Debug.Log(
// payload != null ?
// $"Payload: {JsonUtility.ToJson(payload)}" :
// "No payload.");
var op = request.SendWebRequest();
yield return WaitForTimeout(op, timeoutInSeconds, () =>
{
@ -117,6 +111,170 @@ Response Body: {body}";
url, "DELETE", null, timeoutInSeconds, callbackOnSuccess);
}
/// <summary>
/// Reads a local audio file and uploads its bytes to <paramref name="url"/>
/// with an HTTP PUT request whose Content-Type header is "audio/wav".
/// </summary>
/// <param name="url">Destination URL of the PUT request.</param>
/// <param name="filePath">Path of the local file whose bytes become the request body.</param>
/// <param name="timeoutInSeconds">Timeout value passed to WaitForTimeout.</param>
/// <param name="callbackOnSuccess">
/// Invoked with the request when it finishes with
/// <see cref="UnityWebRequest.Result.Success"/>; may be null.
/// </param>
internal static IEnumerator UploadAudioDataCoroutine(
    string url,
    string filePath,
    float timeoutInSeconds,
    Action<UnityWebRequest> callbackOnSuccess)
{
    if (string.IsNullOrEmpty(filePath))
    {
        Debug.LogError("Failed to upload audio data: no file path provided.");
        yield break;
    }

    // Read the file up front. The read lives in its own try/catch because an
    // iterator cannot yield inside a try block that has a catch clause.
    byte[] audioData = null;
    try
    {
        audioData = File.ReadAllBytes(filePath);
    }
    catch (Exception ex) when (ex is IOException || ex is UnauthorizedAccessException)
    {
        Debug.LogError($"Failed to read audio file '{filePath}': {ex.Message}");
    }

    if (audioData == null)
    {
        yield break;
    }

    // PUT the audio data as binary
    using (UnityWebRequest request = UnityWebRequest.Put(url, audioData))
    {
        request.SetRequestHeader("Content-Type", "audio/wav");
        var op = request.SendWebRequest();
        yield return WaitForTimeout(op, timeoutInSeconds, () =>
        {
            Debug.LogError("Request timed out.");
        });

        if (request.result == UnityWebRequest.Result.Success)
        {
            callbackOnSuccess?.Invoke(request);
        }
        else
        {
            var body = request.downloadHandler?.text ?? string.Empty;
            Debug.LogError($"Failed to upload audio data: {request.error} (HTTP {request.responseCode})\nBody: {body}");
        }
    }
}
/// <summary>
/// Converts <paramref name="audioClip"/> to WAV bytes and uploads them through
/// <c>UploadFileCoroutine</c> as a form field named "file".
/// </summary>
/// <param name="url">Destination URL of the upload.</param>
/// <param name="audioClip">Clip to convert and upload.</param>
/// <param name="timeoutInSeconds">Timeout forwarded to the upload coroutine.</param>
/// <param name="callbackOnSuccess">Forwarded to the upload coroutine; may be null.</param>
internal static IEnumerator UploadAudioCoroutine(
    string url,
    AudioClip audioClip,
    float timeoutInSeconds,
    Action<UnityWebRequest> callbackOnSuccess)
{
    // Without this guard the audioClip.name access below throws a
    // NullReferenceException for a null clip.
    if (audioClip == null)
    {
        Debug.LogError("UploadAudioCoroutine: audioClip is null");
        yield break;
    }

    // Convert AudioClip to WAV (PCM 16-bit little endian) without external utility.
    byte[] audioData = AudioClipToWavBytes(audioClip);
    if (audioData == null || audioData.Length == 0)
    {
        // The converter produced no data (it logs the reason); nothing to upload.
        yield break;
    }

    string fileName = $"{audioClip.name}.wav";
    string fieldName = "file";
    yield return UploadFileCoroutine(
        url, audioData, fileName, fieldName, timeoutInSeconds, callbackOnSuccess);
}
/// <summary>
/// Builds an in-memory WAV file (44-byte RIFF header followed by 16-bit
/// little-endian PCM samples) from the provided AudioClip.
/// Supports mono or multi-channel clips. Assumes clip.samples * channels * 2
/// fits in int32.
/// </summary>
/// <param name="clip">Clip whose sample data is converted.</param>
/// <returns>
/// The WAV bytes, or an empty array when <paramref name="clip"/> is null or its
/// sample data cannot be read.
/// </returns>
private static byte[] AudioClipToWavBytes(AudioClip clip)
{
    if (clip == null)
    {
        Debug.LogError("AudioClipToWavBytes: clip is null");
        return Array.Empty<byte>();
    }

    const int headerSize = 44;      // canonical PCM WAV header
    const int bytesPerSample = 2;   // 16-bit PCM

    int channels = clip.channels;
    int sampleCount = clip.samples * channels; // total samples across channels
    int sampleRate = clip.frequency;

    // Get float data; GetData reports failure through its return value.
    float[] floatData = new float[sampleCount];
    if (sampleCount > 0 && !clip.GetData(floatData, 0))
    {
        Debug.LogError("AudioClipToWavBytes: failed to read sample data from clip");
        return Array.Empty<byte>();
    }

    int dataSize = sampleCount * bytesPerSample; // NumSamples * NumChannels * BitsPerSample/8
    byte[] wav = new byte[headerSize + dataSize];

    // Local helpers that write little-endian values / ASCII tags into the buffer.
    void WriteInt32LE(int offset, int value)
    {
        wav[offset] = (byte)(value & 0xFF);
        wav[offset + 1] = (byte)((value >> 8) & 0xFF);
        wav[offset + 2] = (byte)((value >> 16) & 0xFF);
        wav[offset + 3] = (byte)((value >> 24) & 0xFF);
    }

    void WriteInt16LE(int offset, short value)
    {
        wav[offset] = (byte)(value & 0xFF);
        wav[offset + 1] = (byte)((value >> 8) & 0xFF);
    }

    void WriteTag(int offset, string tag)
    {
        for (int i = 0; i < tag.Length; i++)
        {
            wav[offset + i] = (byte)tag[i];
        }
    }

    WriteTag(0, "RIFF");                                    // ChunkID
    WriteInt32LE(4, 36 + dataSize);                         // ChunkSize = 36 + Subchunk2Size
    WriteTag(8, "WAVE");                                    // Format
    WriteTag(12, "fmt ");                                   // Subchunk1ID
    WriteInt32LE(16, 16);                                   // Subchunk1Size (16 for PCM)
    WriteInt16LE(20, 1);                                    // AudioFormat (1 = PCM)
    WriteInt16LE(22, (short)channels);                      // NumChannels
    WriteInt32LE(24, sampleRate);                           // SampleRate
    WriteInt32LE(28, sampleRate * channels * bytesPerSample); // ByteRate
    WriteInt16LE(32, (short)(channels * bytesPerSample));   // BlockAlign
    WriteInt16LE(34, 16);                                   // BitsPerSample
    WriteTag(36, "data");                                   // Subchunk2ID
    WriteInt32LE(40, dataSize);                             // Subchunk2Size

    // Convert to 16-bit PCM directly into the output buffer, right after the
    // header, so no intermediate PCM array and copy are needed.
    int writeOffset = headerSize;
    for (int i = 0; i < sampleCount; i++)
    {
        // Clamp just in case
        float f = Mathf.Clamp(floatData[i], -1f, 1f);
        short s = (short)Mathf.RoundToInt(f * 32767f);
        WriteInt16LE(writeOffset, s);
        writeOffset += bytesPerSample;
    }

    return wav;
}
/// <summary>
/// Uploads <paramref name="fileData"/> to <paramref name="url"/> as a binary
/// form field using an HTTP POST built from a <see cref="WWWForm"/>.
/// </summary>
/// <param name="url">Destination URL of the POST request.</param>
/// <param name="fileData">Bytes of the file to upload.</param>
/// <param name="fileName">File name attached to the form field.</param>
/// <param name="fieldName">Name of the form field carrying the file.</param>
/// <param name="timeoutInSeconds">Timeout value passed to WaitForTimeout.</param>
/// <param name="callbackOnSuccess">
/// Invoked with the request when it finishes with
/// <see cref="UnityWebRequest.Result.Success"/>; may be null.
/// </param>
internal static IEnumerator UploadFileCoroutine(
    string url,
    byte[] fileData,
    string fileName,
    string fieldName,
    float timeoutInSeconds,
    Action<UnityWebRequest> callbackOnSuccess)
{
    if (fileData == null || fileData.Length == 0)
    {
        Debug.LogError("Error uploading file: no data to upload.");
        yield break;
    }

    WWWForm form = new WWWForm();
    form.AddBinaryData(fieldName, fileData, fileName);

    using (UnityWebRequest request = UnityWebRequest.Post(url, form))
    {
        var op = request.SendWebRequest();
        yield return WaitForTimeout(op, timeoutInSeconds, () =>
        {
            Debug.LogError("Request timed out.");
        });

        if (request.result == UnityWebRequest.Result.Success)
        {
            callbackOnSuccess?.Invoke(request);
        }
        else
        {
            // Include the status code and body, as UploadAudioDataCoroutine does,
            // so server-side failures can be diagnosed from the log.
            var body = request.downloadHandler?.text ?? string.Empty;
            Debug.LogError(
                $"Error uploading file: {request.error} (HTTP {request.responseCode})\nBody: {body}");
        }
    }
}
internal static IEnumerator DownloadAudioCoroutine(
string url,
float timeoutInSeconds,

View File

@ -1,4 +1,5 @@
using System;
using System.IO;
using System.Collections;
using System.Linq;
@ -7,7 +8,6 @@ using UnityEngine;
namespace PPGIA.X540.Project3.API
{
[RequireComponent(typeof(AudioSource))]
public class ApiClientManager : MonoBehaviour
{
#region -- Inspector Fields -------------------------------------------
@ -29,7 +29,7 @@ namespace PPGIA.X540.Project3.API
private string _sessionCloseEndpoint = "/session/close";
[SerializeField]
private string _chatEndpoint = "/chat/";
private string _chatEndpoint = "/chat";
[SerializeField]
private string _llmAgentEndpoint = "/agent/ask";
@ -38,7 +38,13 @@ namespace PPGIA.X540.Project3.API
private string _ttsEndpoint = "/tts/synthesize";
[SerializeField]
private string _sttEndpoint = "/stt/upload";
private string _sttUploadEndpoint = "/transcript/get-upload-url";
[SerializeField]
private string _sttStartEndpoint = "/transcript/start";
[SerializeField]
private string _sttDownloadEndpoint = "/transcript/download";
[Header("API Settings & Workload")]
[SerializeField]
@ -73,7 +79,10 @@ namespace PPGIA.X540.Project3.API
// Resolves the AudioSource reference from this GameObject when none has been
// assigned, and warns if the component cannot be found.
void Awake()
{
    if (_audioSource != null)
    {
        return;
    }

    _audioSource = GetComponent<AudioSource>();
    if (_audioSource == null)
    {
        Debug.LogWarning("AudioSource component is missing.");
    }
}
#region -- API Calls --------------------------------------------------
@ -131,6 +140,135 @@ namespace PPGIA.X540.Project3.API
}));
}
/// <summary>
/// Requests an upload URL from the STT upload endpoint for the current session
/// and then uploads the local audio file to that URL.
/// </summary>
/// <param name="localFilePath">Path of the local audio file to upload.</param>
/// <param name="uploadCompletedCallback">
/// Invoked with the S3 key returned by the upload-URL request once the file
/// upload succeeds; may be null.
/// </param>
public void UploadAudioClip(
    string localFilePath, Action<string> uploadCompletedCallback = null)
{
    if (_session == null)
    {
        Debug.LogWarning("No active session. Please initiate a session first.");
        return;
    }

    if (string.IsNullOrEmpty(localFilePath))
    {
        Debug.LogWarning("No file path provided for upload.");
        return;
    }

    StopAllCoroutines();

    var url = EndpointUrl(_sttUploadEndpoint, _session.SessionId);

    // NOTE(review): JsonUtility does not serialize anonymous types. If
    // CallEndpointWithPostCoroutine serializes the payload with
    // JsonUtility.ToJson, this body is sent as "{}" - confirm.
    var payload = new
    {
        filename = Path.GetFileName(localFilePath),
        content_type = "audio/wav"
    };

    StartCoroutine(ApiClient.CallEndpointWithPostCoroutine(
        url, _timeoutInSeconds, payload, (request) =>
        {
            var body = request.downloadHandler?.text ?? string.Empty;

            // Parse the response once and read both values from the same object.
            var response = JsonUtility.FromJson<STTUploadResponse>(body);
            var uploadUrl = response?.UploadUrl;
            var s3Key = response?.S3Key;

            if (string.IsNullOrEmpty(uploadUrl))
            {
                Debug.LogWarning("Failed to get upload URL.");
                return;
            }

            StartCoroutine(ApiClient.UploadAudioDataCoroutine(
                uploadUrl, localFilePath, _timeoutInSeconds, (uploadRequest) =>
                {
                    Debug.Log($"Audio upload complete: {uploadRequest.responseCode}");
                    uploadCompletedCallback?.Invoke(s3Key);
                }));
        }));
}
/// <summary>
/// Starts a transcription job for a previously uploaded audio file by posting
/// its S3 key to the STT start endpoint.
/// </summary>
/// <param name="s3Key">S3 key of the uploaded audio file.</param>
/// <param name="transcriptStartedCallback">
/// Invoked with the job name from the response; may be null. Not invoked when
/// the response carries no job name.
/// </param>
[ContextMenu("STT/Start Transcript")]
public void StartTranscript(string s3Key,
    Action<string> transcriptStartedCallback = null)
{
    // Ensure there is an active session
    if (_session == null)
    {
        Debug.LogWarning("No active session. Please initiate a session first.");
        return;
    }

    if (string.IsNullOrEmpty(s3Key))
    {
        Debug.LogWarning("No S3 key provided to start the transcription.");
        return;
    }

    StopAllCoroutines();

    // Build the endpoint URL
    var url = EndpointUrl(_sttStartEndpoint);

    // STTUploadResponse is reused as the request body; only s3_key is set.
    var payload = new STTUploadResponse {
        s3_key = s3Key
    };

    // Make the API call to start the transcription job
    StartCoroutine(ApiClient.CallEndpointWithPostCoroutine(
        url, _timeoutInSeconds, payload, (request) =>
        {
            var body = request.downloadHandler?.text ?? string.Empty;
            var response = ApiModel.FromJson<STTJobResponse>(body);
            var jobName = response?.JobName;

            if (string.IsNullOrEmpty(jobName))
            {
                // Without a job name there is nothing the caller could poll for.
                Debug.LogError("Transcription start response did not contain a job name.");
                return;
            }

            Debug.Log($"Transcription job started: {jobName}");
            transcriptStartedCallback?.Invoke(jobName);
        }));
}
/// <summary>
/// Polls the STT download endpoint for the given transcription job via
/// <c>KeepCallingCoroutine</c>.
/// </summary>
/// <param name="jobName">Name of the transcription job to poll.</param>
/// <param name="transcriptionReceivedCallback">
/// Forwarded to the polling coroutine; may be null.
/// </param>
[ContextMenu("STT/Download Transcription")]
public void DownloadTranscription(string jobName,
    Action<string> transcriptionReceivedCallback = null)
{
    // Delay between two consecutive polling requests.
    const float pollIntervalInSeconds = .5f;

    // Ensure there is an active session
    if (_session == null)
    {
        Debug.LogWarning("No active session. Please initiate a session first.");
        return;
    }

    // A missing job name would start polling a URL built without it.
    if (string.IsNullOrEmpty(jobName))
    {
        Debug.LogWarning("No transcription job name provided for download.");
        return;
    }

    StopAllCoroutines();

    StartCoroutine(KeepCallingCoroutine(
        EndpointUrl(_sttDownloadEndpoint, jobName), pollIntervalInSeconds,
        transcriptionReceivedCallback
    ));
}
/// <summary>
/// Repeatedly issues GET requests to <paramref name="url"/>, waiting
/// <paramref name="delayInSeconds"/> before each one, until the response
/// reports a "FAILED" or "COMPLETED" status.
/// </summary>
/// <param name="url">URL polled for the transcription job state.</param>
/// <param name="delayInSeconds">Delay before each request.</param>
/// <param name="callback">
/// Invoked with the transcript on "COMPLETED" and with null on "FAILED";
/// may be null.
/// </param>
/// <remarks>
/// NOTE(review): the loop ends only on those two statuses. If the request
/// callback is never invoked (presumably on request errors - confirm against
/// CallEndpointWithGetCoroutine), polling continues until the coroutine is
/// stopped.
/// </remarks>
private IEnumerator KeepCallingCoroutine(string url,
    float delayInSeconds, Action<string> callback)
{
    var wait = new WaitForSeconds(delayInSeconds);
    bool keepCalling = true;

    while (keepCalling)
    {
        yield return wait;

        // Make the API call to download the transcription
        yield return ApiClient.CallEndpointWithGetCoroutine(
            url, _timeoutInSeconds, (request) =>
            {
                var body = request.downloadHandler?.text ?? string.Empty;
                var response = ApiModel.FromJson<STTJobResponse>(body);

                // response may be null when the body cannot be parsed; treat
                // that like any other non-terminal status and keep polling.
                switch (response?.Status)
                {
                    case "FAILED":
                        keepCalling = false;
                        Debug.LogError("Transcription job failed.");
                        callback?.Invoke(null);
                        break;

                    case "COMPLETED":
                        keepCalling = false;
                        callback?.Invoke(response.Transcript);
                        break;
                }
            });
    }
}
[ContextMenu("Chat/Send Message")]
public void SendChatMessage(string message = null,
Action<string> responseReceivedCallback = null,

View File

@ -48,6 +48,30 @@ namespace PPGIA.X540.Project3.API
public int ExpiresIn => expires_in;
}
/// <summary>
/// Model carrying an upload URL and an S3 key. Field names are snake_case,
/// presumably to match the JSON keys used by the API (confirm against the
/// backend); the PascalCase properties are read-only accessors for them.
/// </summary>
[Serializable]
public class STTUploadResponse : ApiModel
{
    public string upload_url;

    /// <summary>Read-only accessor for <see cref="upload_url"/>.</summary>
    public string UploadUrl
    {
        get { return upload_url; }
    }

    public string s3_key;

    /// <summary>Read-only accessor for <see cref="s3_key"/>.</summary>
    public string S3Key
    {
        get { return s3_key; }
    }
}
/// <summary>
/// Model describing a transcription job: its name, S3 URI, status and
/// transcript. Field names are snake_case, presumably to match the JSON keys
/// used by the API (confirm against the backend); the PascalCase properties
/// are read-only accessors for them.
/// </summary>
[Serializable]
public class STTJobResponse : ApiModel
{
    public string job_name;

    /// <summary>Read-only accessor for <see cref="job_name"/>.</summary>
    public string JobName
    {
        get { return job_name; }
    }

    public string s3_uri;

    /// <summary>Read-only accessor for <see cref="s3_uri"/>.</summary>
    public string S3Uri
    {
        get { return s3_uri; }
    }

    public string status;

    /// <summary>Read-only accessor for <see cref="status"/>.</summary>
    public string Status
    {
        get { return status; }
    }

    public string transcript;

    /// <summary>Read-only accessor for <see cref="transcript"/>.</summary>
    public string Transcript
    {
        get { return transcript; }
    }
}
internal enum Environment
{
Development,

View File

@ -7,6 +7,10 @@ namespace PPGIA.X540.Project3
{
public class AppManager : MonoBehaviour
{
// Singleton instance
public static AppManager Instance { get; private set; }
#region -- Fields & Properties ----------------------------------------
[Header("References")]
[SerializeField]
private UIController _uiController;
@ -14,59 +18,129 @@ namespace PPGIA.X540.Project3
[SerializeField]
private ApiClientManager _apiManager;
[SerializeField]
private AudioCapture _audioCapture;
private AudioClip _recordedClip;
#endregion ------------------------------------------------------------
#region -- MonoBehaviour Methods --------------------------------------
private void Awake()
{
if (_uiController == null)
_uiController = GetComponent<UIController>();
if (_apiManager == null)
_apiManager = GetComponent<ApiClientManager>();
// Singleton pattern implementation
if (Instance == null)
{
Instance = this;
DontDestroyOnLoad(gameObject);
}
else
{
Destroy(gameObject);
return;
}
void Start()
if (_uiController == null)
{
_apiManager.CloseSession(
() => _uiController.SessionActive = _apiManager.IsSessionActive);
Debug.LogError("UIController reference is missing in AppManager.");
}
if (_apiManager == null)
{
Debug.LogError("ApiClientManager reference is missing in AppManager.");
}
if (_audioCapture == null)
{
Debug.LogError("AudioCapture reference is missing in AppManager.");
}
_uiController.OnTalkButtonClicked += HandleTalkButtonClicked;
_audioCapture.OnRecordingSaved += HandleClipSaved;
}
private void OnEnable()
{
_uiController.OnSessionButtonClicked += HandleSessionButtonClicked;
_uiController.OnSendChatButtonClicked += HandleSendChatButtonClicked;
_apiManager.InitiateSession(() =>
{
Debug.Log("API session initiated successfully.");
});
}
private void OnDisable()
void OnDisable()
{
_uiController.OnSessionButtonClicked -= HandleSessionButtonClicked;
_uiController.OnSendChatButtonClicked -= HandleSendChatButtonClicked;
}
private void HandleSessionButtonClicked()
if (_apiManager != null && _apiManager.IsSessionActive)
{
if (!_apiManager.IsSessionActive)
_apiManager.CloseSession(() =>
{
_apiManager.InitiateSession(
() => _uiController.SessionActive = _apiManager.IsSessionActive);
}
else
{
_apiManager.CloseSession(
() => _uiController.SessionActive = _apiManager.IsSessionActive
);
}
}
private void HandleSendChatButtonClicked(string message)
{
_apiManager.SendChatMessage(message,
(responseMessage) =>
{
_uiController.ChatOutput += $"User: {message}\n";
_uiController.ChatOutput += $"Bot: {responseMessage}\n";
},
() =>
{
// Speech finished callback (optional)
Debug.Log("API session closed successfully.");
});
}
}
/// <summary>
/// Unsubscribes this manager's handlers from the UI controller and the audio
/// capture component.
/// </summary>
private void OnDestroy()
{
    // Either reference may be missing here: Awake only logs an error when one
    // is unassigned, and a duplicate instance destroyed by the singleton check
    // reaches this method too.
    if (_uiController != null)
    {
        _uiController.OnTalkButtonClicked -= HandleTalkButtonClicked;
    }

    if (_audioCapture != null)
    {
        _audioCapture.OnRecordingSaved -= HandleClipSaved;
    }
}
#endregion ------------------------------------------------------------
/// <summary>
/// Handles a saved recording: starts the upload, which chains into starting a
/// transcription job, downloading the transcript and sending it as a chat
/// message, then sets the UI state to Idle.
/// </summary>
/// <param name="filePath">Path of the saved audio file.</param>
private void HandleClipSaved(string filePath)
{
    Debug.Log($"Audio clip saved at: {filePath}");

    _apiManager.UploadAudioClip(filePath, HandleClipUploaded);

    _uiController.CurrentState = UIController.UIState.Idle;
}

// Starts a transcription job for the uploaded clip.
private void HandleClipUploaded(string s3Key)
{
    Debug.Log($"Clip uploaded to: {s3Key}");
    _apiManager.StartTranscript(s3Key, HandleTranscriptStarted);
}

// Begins downloading the transcript of the started job.
private void HandleTranscriptStarted(string jobName)
{
    Debug.Log($"Transcription job started: {jobName}");
    _apiManager.DownloadTranscription(jobName, HandleTranscriptReceived);
}

// Shows the transcript in the chat output and sends it as a chat message.
private void HandleTranscriptReceived(string transcript)
{
    // DownloadTranscription reports a failed job by passing null; do not echo
    // or send an empty message in that case.
    if (string.IsNullOrEmpty(transcript))
    {
        Debug.LogWarning("No transcript available; skipping chat message.");
        return;
    }

    Debug.Log($"Transcription completed: {transcript}");
    _uiController.AppendChatOutput($"\nUser: {transcript}\n");

    _apiManager.SendChatMessage(transcript,
        (response) =>
        {
            _uiController.AppendChatOutput($"Bot: {response}\n");
        }, () =>
        {
            // Speech synthesis finished.
        });
}
/// <summary>
/// Reacts to the talk button: starts recording when the UI is idle, stops
/// recording when it is recording, and only logs while it is processing.
/// Does nothing except warn when no session is active.
/// </summary>
private void HandleTalkButtonClicked()
{
    bool sessionActive = _apiManager.IsSessionActive;
    if (sessionActive == false)
    {
        Debug.LogWarning("Session is not active. Cannot send message.");
        return;
    }

    var state = _uiController.CurrentState;

    if (state == UIController.UIState.Idle)
    {
        _audioCapture.StartRecording();
        _uiController.CurrentState = UIController.UIState.Recording;
    }
    else if (state == UIController.UIState.Recording)
    {
        _audioCapture.StopRecording();
        _uiController.CurrentState = UIController.UIState.Idle;
    }
    else if (state == UIController.UIState.Processing)
    {
        Debug.Log("Currently processing. Please wait.");
    }
}
}
}

View File

@ -18,6 +18,7 @@ namespace PPGIA.X540.Project3
Hz96000 = 96000
}
#region -- Fields & Properties ----------------------------------------
[Header("Audio Capture Settings")]
[SerializeField]
private SampleRate _sampleRateInHz = SampleRate.Hz44100;
@ -54,9 +55,14 @@ namespace PPGIA.X540.Project3
private List<short> _capturedSamples =
new List<short>(1024 * 32); // filled only on Stop
private int _channels = 1; // microphone channel count (Unity usually mono)
private AudioClip _recordingClip;
private string _currentDevice;
private AudioClip _recordingClip;
// Returns the current value of the _recordingClip field.
public AudioClip GetRecordedClip()
{
    return _recordingClip;
}
private string _currentDevice;
#endregion ------------------------------------------------------------
#region -- MonoBehaviour Methods --------------------------------------
private void Awake()
{
_audioSource = GetComponent<AudioSource>();
@ -67,6 +73,7 @@ namespace PPGIA.X540.Project3
{
StopRecording();
}
#endregion ------------------------------------------------------------
[ContextMenu("Start recording audio")]
public void StartRecording()
@ -215,7 +222,6 @@ namespace PPGIA.X540.Project3
writer.Write(dataBytes);
}
LastSavedFilePath = filePath;
Debug.Log($"Audio saved to: {filePath}");
OnRecordingSaved?.Invoke(filePath);
}
catch (Exception ex)