From e0daa0020534c232039149d49eb6c2cef80296f7 Mon Sep 17 00:00:00 2001 From: "Jonas Luz Jr." Date: Wed, 26 Nov 2025 19:31:12 -0300 Subject: [PATCH] Adds STT functionality with audio upload Adds speech-to-text (STT) functionality by allowing users to upload audio clips, start transcription jobs, and download transcriptions. Introduces new API endpoints for STT upload, start, and download. Also, converts AudioClip to WAV byte array. --- Assets/_Client/Scripts/API/ApiClient.cs | 172 +++++++++++++++++- .../_Client/Scripts/API/ApiClientManager.cs | 146 ++++++++++++++- Assets/_Client/Scripts/API/ApiModel.cs | 24 +++ Assets/_Client/Scripts/Core/AppManager.cs | 144 +++++++++++---- .../Scripts/{ => Core}/AudioCapture.cs | 8 +- .../Scripts/{ => Core}/AudioCapture.cs.meta | 0 6 files changed, 447 insertions(+), 47 deletions(-) rename Assets/_Client/Scripts/{ => Core}/AudioCapture.cs (96%) rename Assets/_Client/Scripts/{ => Core}/AudioCapture.cs.meta (100%) diff --git a/Assets/_Client/Scripts/API/ApiClient.cs b/Assets/_Client/Scripts/API/ApiClient.cs index 598a4bd..2f6cbee 100644 --- a/Assets/_Client/Scripts/API/ApiClient.cs +++ b/Assets/_Client/Scripts/API/ApiClient.cs @@ -1,6 +1,6 @@ using System; -using System.Collections; using System.IO; +using System.Collections; using System.Text; using UnityEngine; @@ -56,12 +56,6 @@ namespace PPGIA.X540.Project3.API request.downloadHandler = new DownloadHandlerBuffer(); - // Debug.Log($"Sending {method} request to {url}"); - // Debug.Log( - // payload != null ? - // $"Payload: {JsonUtility.ToJson(payload)}" : - // "No payload."); - var op = request.SendWebRequest(); yield return WaitForTimeout(op, timeoutInSeconds, () => { @@ -117,6 +111,170 @@ Response Body: {body}"; url, "DELETE", null, timeoutInSeconds, callbackOnSuccess); } + internal static IEnumerator UploadAudioDataCoroutine( + string url, + string filePath, + float timeoutInSeconds, + Action callbackOnSuccess) + { + // PUT the audio data as binary + byte[] audioData = File.ReadAllBytes(filePath); + string fileName = Path.GetFileName(filePath); + + using (UnityWebRequest request = UnityWebRequest.Put(url, audioData)) + { + request.SetRequestHeader("Content-Type", "audio/wav"); + + var op = request.SendWebRequest(); + yield return WaitForTimeout(op, timeoutInSeconds, () => + { + Debug.LogError("Request timed out."); + }); + + if (request.result == UnityWebRequest.Result.Success) + { + callbackOnSuccess?.Invoke(request); + } + else + { + var body = request.downloadHandler?.text ?? string.Empty; + Debug.LogError($"Failed to upload audio data: {request.error} (HTTP {request.responseCode})\nBody: {body}"); + } + } + } + + internal static IEnumerator UploadAudioCoroutine( + string url, + AudioClip audioClip, + float timeoutInSeconds, + Action callbackOnSuccess) + { + // Convert AudioClip to WAV (PCM 16-bit little endian) without external utility. + byte[] audioData = AudioClipToWavBytes(audioClip); + string fileName = $"{audioClip.name}.wav"; + string fieldName = "file"; + + yield return UploadFileCoroutine( + url, audioData, fileName, fieldName, timeoutInSeconds, callbackOnSuccess); + } + + // Writes a WAV file header + PCM 16-bit data for the provided AudioClip. + // Supports mono or multi-channel clips. Assumes clip.samples * channels fits in int32. + private static byte[] AudioClipToWavBytes(AudioClip clip) + { + if (clip == null) + { + Debug.LogError("AudioClipToWavBytes: clip is null"); + return Array.Empty(); + } + + int channels = clip.channels; + int sampleCount = clip.samples * channels; // total samples across channels + int sampleRate = clip.frequency; + + // Get float data + float[] floatData = new float[sampleCount]; + clip.GetData(floatData, 0); + + // Convert to 16-bit PCM + // Each sample -> 2 bytes + byte[] pcmData = new byte[sampleCount * 2]; + int pcmIndex = 0; + for (int i = 0; i < sampleCount; i++) + { + // Clamp just in case + float f = Mathf.Clamp(floatData[i], -1f, 1f); + short s = (short)Mathf.RoundToInt(f * 32767f); + pcmData[pcmIndex++] = (byte)(s & 0xFF); // little endian + pcmData[pcmIndex++] = (byte)((s >> 8) & 0xFF); + } + + // WAV header size is 44 bytes + int headerSize = 44; + int fileSize = headerSize + pcmData.Length; + byte[] wav = new byte[fileSize]; + + // Helper local to write int/short little endian + void WriteInt32LE(int offset, int value) + { + wav[offset] = (byte)(value & 0xFF); + wav[offset + 1] = (byte)((value >> 8) & 0xFF); + wav[offset + 2] = (byte)((value >> 16) & 0xFF); + wav[offset + 3] = (byte)((value >> 24) & 0xFF); + } + void WriteInt16LE(int offset, short value) + { + wav[offset] = (byte)(value & 0xFF); + wav[offset + 1] = (byte)((value >> 8) & 0xFF); + } + + // ChunkID "RIFF" + wav[0] = (byte)'R'; wav[1] = (byte)'I'; wav[2] = (byte)'F'; wav[3] = (byte)'F'; + // ChunkSize = 36 + Subchunk2Size + int subchunk2Size = pcmData.Length; // NumSamples * NumChannels * BitsPerSample/8 + WriteInt32LE(4, 36 + subchunk2Size); + // Format "WAVE" + wav[8] = (byte)'W'; wav[9] = (byte)'A'; wav[10] = (byte)'V'; wav[11] = (byte)'E'; + // Subchunk1ID "fmt " + wav[12] = (byte)'f'; wav[13] = (byte)'m'; wav[14] = (byte)'t'; wav[15] = (byte)' '; + // Subchunk1Size (16 for PCM) + WriteInt32LE(16, 16); + // AudioFormat (1 = PCM) + WriteInt16LE(20, 1); + // NumChannels + WriteInt16LE(22, (short)channels); + // SampleRate + WriteInt32LE(24, sampleRate); + // ByteRate = SampleRate * NumChannels * BitsPerSample/8 + int byteRate = sampleRate * channels * 2; + WriteInt32LE(28, byteRate); + // BlockAlign = NumChannels * BitsPerSample/8 + WriteInt16LE(32, (short)(channels * 2)); + // BitsPerSample + WriteInt16LE(34, 16); + // Subchunk2ID "data" + wav[36] = (byte)'d'; wav[37] = (byte)'a'; wav[38] = (byte)'t'; wav[39] = (byte)'a'; + // Subchunk2Size + WriteInt32LE(40, subchunk2Size); + + // Copy PCM data after header + Buffer.BlockCopy(pcmData, 0, wav, headerSize, pcmData.Length); + + return wav; + } + + internal static IEnumerator UploadFileCoroutine( + string url, + byte[] fileData, + string fileName, + string fieldName, + float timeoutInSeconds, + Action callbackOnSuccess) + { + WWWForm form = new WWWForm(); + form.AddBinaryData(fieldName, fileData, fileName); + + using (UnityWebRequest request = + UnityWebRequest.Post(url, form)) + { + var op = request.SendWebRequest(); + yield return WaitForTimeout(op, timeoutInSeconds, () => + { + Debug.LogError("Request timed out."); + }); + + if (request.result == UnityWebRequest.Result.Success) + { + callbackOnSuccess?.Invoke(request); + } + else + { + Debug.LogError( + $"Error uploading file: {request.error}"); + } + } + } + internal static IEnumerator DownloadAudioCoroutine( string url, float timeoutInSeconds, diff --git a/Assets/_Client/Scripts/API/ApiClientManager.cs b/Assets/_Client/Scripts/API/ApiClientManager.cs index 95bb5e5..8a05ead 100644 --- a/Assets/_Client/Scripts/API/ApiClientManager.cs +++ b/Assets/_Client/Scripts/API/ApiClientManager.cs @@ -1,4 +1,5 @@ using System; +using System.IO; using System.Collections; using System.Linq; @@ -7,7 +8,6 @@ using UnityEngine; namespace PPGIA.X540.Project3.API { - [RequireComponent(typeof(AudioSource))] public class ApiClientManager : MonoBehaviour { #region -- Inspector Fields ------------------------------------------- @@ -29,7 +29,7 @@ namespace PPGIA.X540.Project3.API private string _sessionCloseEndpoint = "/session/close"; [SerializeField] - private string _chatEndpoint = "/chat/"; + private string _chatEndpoint = "/chat"; [SerializeField] private string _llmAgentEndpoint = "/agent/ask"; @@ -38,7 +38,13 @@ namespace PPGIA.X540.Project3.API private string _ttsEndpoint = "/tts/synthesize"; [SerializeField] - private string _sttEndpoint = "/stt/upload"; + private string _sttUploadEndpoint = "/transcript/get-upload-url"; + + [SerializeField] + private string _sttStartEndpoint = "/transcript/start"; + + [SerializeField] + private string _sttDownloadEndpoint = "/transcript/download"; [Header("API Settings & Workload")] [SerializeField] @@ -73,7 +79,10 @@ namespace PPGIA.X540.Project3.API void Awake() { - _audioSource = GetComponent(); + if (_audioSource == null) + _audioSource = GetComponent(); + if (_audioSource == null) + Debug.LogWarning("AudioSource component is missing."); } #region -- API Calls -------------------------------------------------- @@ -131,6 +140,135 @@ namespace PPGIA.X540.Project3.API })); } + public void UploadAudioClip( + string localFilePath, Action uploadCompletedCallback = null) + { + if (_session == null) + { + Debug.LogWarning("No active session. Please initiate a session first."); + return; + } + + StopAllCoroutines(); + + var url = EndpointUrl(_sttUploadEndpoint, _session.SessionId); + var payload = new + { + filename = Path.GetFileName(localFilePath), + content_type = "audio/wav" + }; + + StartCoroutine(ApiClient.CallEndpointWithPostCoroutine( + url, _timeoutInSeconds, payload, (request) => + { + var body = request.downloadHandler?.text ?? string.Empty; + var uploadUrl = JsonUtility.FromJson(body)?.UploadUrl; + var s3Key = JsonUtility.FromJson(body)?.S3Key; + if (uploadUrl == null) + { + Debug.LogWarning("Failed to get upload URL."); + return; + } + + StartCoroutine(ApiClient.UploadAudioDataCoroutine( + uploadUrl, localFilePath, _timeoutInSeconds, (uploadRequest) => + { + Debug.Log($"Audio upload complete: {uploadRequest.responseCode}"); + uploadCompletedCallback?.Invoke(s3Key); + })); + })); + } + + + [ContextMenu("STT/Upload Audio Clip")] + public void StartTranscript(string s3Key, + Action transcriptStartedCallback = null) + { + // Ensure there is an active session + if (_session == null) + { + Debug.LogWarning("No active session. Please initiate a session first."); + return; + } + if (string.IsNullOrEmpty(s3Key)) + { + Debug.LogWarning("No file path provided for upload."); + return; + } + + StopAllCoroutines(); + + // Build the endpoint URL + var url = EndpointUrl(_sttStartEndpoint); + var payload = new STTUploadResponse { + s3_key = s3Key + }; + + // Make the API call to upload the audio clip + StartCoroutine(ApiClient.CallEndpointWithPostCoroutine( + url, _timeoutInSeconds, payload, (request) => + { + var body = request.downloadHandler?.text ?? string.Empty; + var response = ApiModel.FromJson(body); + var jobName = response?.JobName; + + Debug.Log($"Transcription job started: {jobName}"); + transcriptStartedCallback?.Invoke(jobName); + })); + } + + [ContextMenu("STT/Download Transcription")] + public void DownloadTranscription(string jobName, + Action transcriptionReceivedCallback = null) + { + // Ensure there is an active session + if (_session == null) + { + Debug.LogWarning("No active session. Please initiate a session first."); + return; + } + + StopAllCoroutines(); + + StartCoroutine(KeepCallingCoroutine( + EndpointUrl(_sttDownloadEndpoint, jobName), .5f, + transcriptionReceivedCallback + )); + + } + + private IEnumerator KeepCallingCoroutine(string url, + float delayInSeconds, Action callback) + { + // Make the API call to download the transcription + var wait = new WaitForSeconds(delayInSeconds); + + bool keepCalling = true; + while (keepCalling) + { + yield return wait; + yield return ApiClient.CallEndpointWithGetCoroutine( + url, _timeoutInSeconds, (request) => + { + var body = request.downloadHandler?.text ?? string.Empty; + var response = ApiModel.FromJson(body); + + if (response.Status == "FAILED") + { + keepCalling = false; + Debug.LogError("Transcription job failed."); + callback?.Invoke(null); + } + else if (response.Status == "COMPLETED") + { + keepCalling = false; + callback?.Invoke(response?.Transcript); + } + }); + } + } + + [ContextMenu("Chat/Send Message")] public void SendChatMessage(string message = null, Action responseReceivedCallback = null, diff --git a/Assets/_Client/Scripts/API/ApiModel.cs b/Assets/_Client/Scripts/API/ApiModel.cs index 5f4431b..b073dc0 100644 --- a/Assets/_Client/Scripts/API/ApiModel.cs +++ b/Assets/_Client/Scripts/API/ApiModel.cs @@ -48,6 +48,30 @@ namespace PPGIA.X540.Project3.API public int ExpiresIn => expires_in; } + [Serializable] + public class STTUploadResponse : ApiModel + { + public string upload_url; + public string s3_key; + + public string UploadUrl => upload_url; + public string S3Key => s3_key; + } + + [Serializable] + public class STTJobResponse : ApiModel + { + public string job_name; + public string s3_uri; + public string status; + public string transcript; + + public string JobName => job_name; + public string S3Uri => s3_uri; + public string Status => status; + public string Transcript => transcript; + } + internal enum Environment { Development, diff --git a/Assets/_Client/Scripts/Core/AppManager.cs b/Assets/_Client/Scripts/Core/AppManager.cs index 77bd7fa..75bfcf6 100644 --- a/Assets/_Client/Scripts/Core/AppManager.cs +++ b/Assets/_Client/Scripts/Core/AppManager.cs @@ -7,6 +7,10 @@ namespace PPGIA.X540.Project3 { public class AppManager : MonoBehaviour { + // Singleton instance + public static AppManager Instance { get; private set; } + + #region -- Fields & Properties ---------------------------------------- [Header("References")] [SerializeField] private UIController _uiController; @@ -14,59 +18,129 @@ namespace PPGIA.X540.Project3 [SerializeField] private ApiClientManager _apiManager; + [SerializeField] + private AudioCapture _audioCapture; + + private AudioClip _recordedClip; + #endregion ------------------------------------------------------------ + + #region -- MonoBehaviour Methods -------------------------------------- private void Awake() { - if (_uiController == null) - _uiController = GetComponent(); - if (_apiManager == null) - _apiManager = GetComponent(); - } + // Singleton pattern implementation + if (Instance == null) + { + Instance = this; + DontDestroyOnLoad(gameObject); + } + else + { + Destroy(gameObject); + return; + } + + if (_uiController == null) + { + Debug.LogError("UIController reference is missing in AppManager."); + } + if (_apiManager == null) + { + Debug.LogError("ApiClientManager reference is missing in AppManager."); + } + if (_audioCapture == null) + { + Debug.LogError("AudioCapture reference is missing in AppManager."); + } + + _uiController.OnTalkButtonClicked += HandleTalkButtonClicked; + _audioCapture.OnRecordingSaved += HandleClipSaved; - void Start() - { - _apiManager.CloseSession( - () => _uiController.SessionActive = _apiManager.IsSessionActive); } private void OnEnable() { - _uiController.OnSessionButtonClicked += HandleSessionButtonClicked; - _uiController.OnSendChatButtonClicked += HandleSendChatButtonClicked; + _apiManager.InitiateSession(() => + { + Debug.Log("API session initiated successfully."); + }); } - private void OnDisable() + void OnDisable() { - _uiController.OnSessionButtonClicked -= HandleSessionButtonClicked; - _uiController.OnSendChatButtonClicked -= HandleSendChatButtonClicked; + if (_apiManager != null && _apiManager.IsSessionActive) + { + _apiManager.CloseSession(() => + { + Debug.Log("API session closed successfully."); + }); + } } - private void HandleSessionButtonClicked() + private void OnDestroy() + { + _uiController.OnTalkButtonClicked -= HandleTalkButtonClicked; + _audioCapture.OnRecordingSaved -= HandleClipSaved; + } + #endregion ------------------------------------------------------------ + + private void HandleClipSaved(string filePath) + { + Debug.Log($"Audio clip saved at: {filePath}"); + + _apiManager.UploadAudioClip( + filePath, + (s3Key) => + { + Debug.Log($"Clip uploaded to: {s3Key}"); + _apiManager.StartTranscript( + s3Key, + (jobName) => + { + Debug.Log($"Transcription job started: {jobName}"); + _apiManager.DownloadTranscription(jobName, (transcript) => + { + Debug.Log($"Transcription completed: {transcript}"); + _uiController.AppendChatOutput($"\nUser: {transcript}\n"); + + _apiManager.SendChatMessage(transcript, + (response) => + { + _uiController.AppendChatOutput($"Bot: {response}\n"); + }, () => + { + // Speech synthesis finished. + }); + }); + }); + }); + + _uiController.CurrentState = UIController.UIState.Idle; + } + + private void HandleTalkButtonClicked() { if (!_apiManager.IsSessionActive) { - _apiManager.InitiateSession( - () => _uiController.SessionActive = _apiManager.IsSessionActive); + Debug.LogWarning("Session is not active. Cannot send message."); + return; } - else - { - _apiManager.CloseSession( - () => _uiController.SessionActive = _apiManager.IsSessionActive - ); - } - } - private void HandleSendChatButtonClicked(string message) - { - _apiManager.SendChatMessage(message, - (responseMessage) => + switch (_uiController.CurrentState) { - _uiController.ChatOutput += $"User: {message}\n"; - _uiController.ChatOutput += $"Bot: {responseMessage}\n"; - }, - () => - { - // Speech finished callback (optional) - }); + case UIController.UIState.Idle: + _audioCapture.StartRecording(); + _uiController.CurrentState = UIController.UIState.Recording; + break; + + case UIController.UIState.Recording: + _audioCapture.StopRecording(); + _uiController.CurrentState = UIController.UIState.Idle; + break; + + case UIController.UIState.Processing: + Debug.Log("Currently processing. Please wait."); + break; + } } } } diff --git a/Assets/_Client/Scripts/AudioCapture.cs b/Assets/_Client/Scripts/Core/AudioCapture.cs similarity index 96% rename from Assets/_Client/Scripts/AudioCapture.cs rename to Assets/_Client/Scripts/Core/AudioCapture.cs index 7c7c926..97d53e9 100644 --- a/Assets/_Client/Scripts/AudioCapture.cs +++ b/Assets/_Client/Scripts/Core/AudioCapture.cs @@ -18,6 +18,7 @@ namespace PPGIA.X540.Project3 Hz96000 = 96000 } + #region -- Fields & Properties ---------------------------------------- [Header("Audio Capture Settings")] [SerializeField] private SampleRate _sampleRateInHz = SampleRate.Hz44100; @@ -54,9 +55,14 @@ namespace PPGIA.X540.Project3 private List _capturedSamples = new List(1024 * 32); // filled only on Stop private int _channels = 1; // microphone channel count (Unity usually mono) + private AudioClip _recordingClip; + public AudioClip GetRecordedClip() => _recordingClip; + private string _currentDevice; + #endregion ------------------------------------------------------------ + #region -- MonoBehaviour Methods -------------------------------------- private void Awake() { _audioSource = GetComponent(); @@ -67,6 +73,7 @@ namespace PPGIA.X540.Project3 { StopRecording(); } + #endregion ------------------------------------------------------------ [ContextMenu("Start recording audio")] public void StartRecording() @@ -215,7 +222,6 @@ namespace PPGIA.X540.Project3 writer.Write(dataBytes); } LastSavedFilePath = filePath; - Debug.Log($"Audio saved to: {filePath}"); OnRecordingSaved?.Invoke(filePath); } catch (Exception ex) diff --git a/Assets/_Client/Scripts/AudioCapture.cs.meta b/Assets/_Client/Scripts/Core/AudioCapture.cs.meta similarity index 100% rename from Assets/_Client/Scripts/AudioCapture.cs.meta rename to Assets/_Client/Scripts/Core/AudioCapture.cs.meta