o
    y	i'<                     @   s   d Z ddlZddlZddlZddlZddlZddlmZmZm	Z	m
Z
 ddlmZ ddlZddlZddlmZ ddlmZ ddlmZ G dd	 d	Ze ZdS )
za
Speech-to-Text Service for MCube Voice Bot
Handles audio transcription using OpenAI Whisper API
    N)OptionalDictAnyList)datetime)Config)Log)AudioNoiseServicec                	   @   s   e Zd ZdZdd Zdededee fddZd	ee dedee fd
dZ	de
dedee
 fddZde
de
fddZdde
dede
fddZde
dedee fddZdeeeef  dedeeeef  fddZdS )SpeechToTextServicez
    Service for converting audio to text using OpenAI Whisper API.
    Handles audio format conversion and transcription.
    c                 C   sr   t j| _d| _g | _t | _t j| _	t j
| _t j| _t | _| j	r2td| j d| j d d S td d S )Nz.https://api.openai.com/v1/audio/transcriptionsu0   🎤 Speech-to-Text Service initialized (Model: z, Language: )u4   🎤 Speech-to-Text Service initialized but disabled)r   ZOPENAI_API_KEYopenai_api_keywhisper_urlZaudio_bufferasyncioZLockZbuffer_lockZSPEECH_TO_TEXT_ENABLEDenabledZSTT_LANGUAGElanguageZ	STT_MODELmodelr	   noise_servicer   info)self r   "services/speech_to_text_service.py__init__   s   
 zSpeechToTextService.__init__
audio_data
session_idreturnc              
      sB  | j std|  dS ztt|}td| dt| d t|dk r6tdt| d W dS | ||I dH }|sLtd|  W dS td	|  | ||I dH }|rx|	 rxt
d
| d|dd  d |	 W S t
d|  W dS  ty } ztd| d|  W Y d}~dS d}~ww )a&  
        Transcribe base64 encoded audio data to text using OpenAI Whisper.
        
        Args:
            audio_data: Base64 encoded audio data
            session_id: Session ID for logging
            
        Returns:
            Transcribed text or None if transcription fails
        uA   🎤 Speech-to-text disabled, skipping transcription for session Nu   🎤 Decoded audio for session :  bytesi  u(   🎤 Audio too short for transcription: u2   ⚠️ Failed to process audio format for session u%   🎤 Calling Whisper API for session u#   🎤 Transcribed audio for session : '2   ...'u-   🎤 No speech detected in audio for session u)   ❌ Error transcribing audio for session )r   r   debugbase64	b64decodelen_process_audio_formatwarning_call_whisper_apistripr   	Exceptionerror)r   r   r   audio_bytesprocessed_audiotranscriptioner   r   r   transcribe_audio*   s4   
 
z$SpeechToTextService.transcribe_audioaudio_chunksc           	         s~  | j std|  dS z|sW dS d}|D ]*}zt|}||7 }W q tyB } ztd| d|  W Y d}~qd}~ww |sPtd|  W dS t|dk rdtdt| d	 W dS | ||I dH }|srW dS | 	||I dH }|r|
 rtd
| d|dd  d |
 W S td|  W dS  ty } ztd| d|  W Y d}~dS d}~ww )a)  
        Transcribe multiple audio chunks by combining them first.
        
        Args:
            audio_chunks: List of base64 encoded audio chunks
            session_id: Session ID for logging
            
        Returns:
            Transcribed text or None if transcription fails
        uH   🎤 Speech-to-text disabled, skipping buffer transcription for session N    u0   ⚠️ Failed to decode audio chunk for session r   u)   ⚠️ No valid audio chunks for session @  u1   🎤 Combined audio too short for transcription: r   u,   🎤 Transcribed combined audio for session r   r   r   u6   🎤 No speech detected in combined audio for session u0   ❌ Error transcribing audio buffer for session )r   r   r    r!   r"   r(   r%   r#   r$   r&   r'   r   r)   )	r   r/   r   Zcombined_audiochunkr*   r-   r+   r,   r   r   r   transcribe_audio_bufferX   sJ   
 
z+SpeechToTextService.transcribe_audio_bufferr*   c              
      s   zF|  |}tjr| j||}|dur|}| j|dd}t|dk r2tdt| d W dS td| dt| d	t| d |W S  t	ye } zt
d
| d|  W Y d}~dS d}~ww )uY  
        Process audio format for Whisper API compatibility.
        Converts from MCube format (μ-law) to WAV format and applies noise reduction.
        
        Args:
            audio_bytes: Raw audio bytes
            session_id: Session ID for logging
            
        Returns:
            Processed audio bytes in WAV format
        Nr1   )sample_ratei>  u   🎵 WAV file too short: r   u(   🎵 Processed audio format for session r   z -> u.   ❌ Error processing audio format for session )_mulaw_to_pcmr   ZAUDIO_NOISE_REDUCTION_ENABLEDr   Zprocess_audio_chunk_create_wav_filer#   r   r    r(   r)   )r   r*   r   pcm_dataZprocessed_pcmZwav_datar-   r   r   r   r$      s$   
&z)SpeechToTextService._process_audio_format
mulaw_datac                 C   s>   g d}t  }|D ]}|| }||jdddd q	t|S )u   
        Convert μ-law encoded audio to PCM.
        
        Args:
            mulaw_data: μ-law encoded audio bytes
            
        Returns:
            PCM audio bytes
        (   iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiDiiDiiDiiDiiDiiDiiDiiDiii$idiii$idiii$idiii$idiiiiii4iTitiiiiii4iTitiiiiiiiiiii,i<iLi\ili|iiiiiiiiiiiiiiir   i|}  i|y  i|u  i|q  i|m  i|i  i|e  i|a  i|]  i|Y  i|U  i|Q  i|M  i|I  i|E  i|A  i|>  i|<  i|:  i|8  i|6  i|4  i|2  i|0  i|.  i|,  i|*  i|(  i|&  i|$  i|"  i|   i  i  i  i  i  i  i  i  i  i  i  i  i  i  i  i  i<  i  i<  i  i<  i  i<  i  i<  i
  i<
  i	  i<	  i  i<  i  i\  i  i  i  i\  i  i  i  i\  i  i  i  i\  i  i  i  il  iL  i,  i  i  i  i  i  il  iL  i,  i  i  i  i  i  it  id  iT  iD  i4  i$  i  i                          x   p   h   `   X   P   H   @   8   0   (                r      littleT)	byteorderZsigned)	bytearrayextendto_bytesbytes)r   r8   Zmulaw_tabler7   ZbyteZ	pcm_valuer   r   r   r5      s   #z!SpeechToTextService._mulaw_to_pcmr1   r7   r4   c                 C   s&  d}d}|| | d }|| d }t  }|d |t|d jddd |d	 |d
 |djddd |djddd ||jddd ||jddd ||jddd ||jddd ||jddd |d |t|jddd || | S )z
        Create WAV file from PCM data.
        
        Args:
            pcm_data: PCM audio data
            sample_rate: Sample rate in Hz
            
        Returns:
            WAV file bytes
           rN   rO   s   RIFF$      rQ   )rR   s   WAVEs   fmt rP   s   data)ioBytesIOwriter#   rU   getvalue)r   r7   r4   ZchannelsZbits_per_sampleZ	byte_rateZblock_alignZ
wav_bufferr   r   r   r6      s(   




z$SpeechToTextService._create_wav_filec           
   
      s  zdd| j  i}t }|jd|ddd |d| j |d| j |d	d
 t 4 I dH }|j| j||tj	ddd4 I dH m}|j
dkr|| I dH }td| d|dd  d | W  d  I dH  W  d  I dH  W S | I dH }td| d|j
 d|  tdt| d 	 W d  I dH  W d  I dH  W dS 1 I dH sw   Y  W d  I dH  W dS 1 I dH sw   Y  W dS  tjy   td|  Y dS  ty }	 ztd| d|	  W Y d}	~	dS d}	~	ww )z
        Call OpenAI Whisper API for transcription.
        
        Args:
            audio_data: WAV audio data
            session_id: Session ID for logging
            
        Returns:
            Transcribed text or None if failed
        ZAuthorizationzBearer filez	audio.wavz	audio/wav)filenameZcontent_typer   r   Zresponse_formattextN   )total)headersdataZtimeout   u&   🎤 Whisper API response for session r   d   r   u"   ❌ Whisper API error for session r   z - u(   🎤 Audio data size that caused error: r   u$   ❌ Whisper API timeout for session u*   ❌ Error calling Whisper API for session )r   aiohttpZFormDataZ	add_fieldr   r   ZClientSessionZpostr   ZClientTimeoutstatusr`   r   r    r'   r)   r#   r   TimeoutErrorr(   )
r   r   r   rc   rd   ZsessionZresponser,   Z
error_textr-   r   r   r   r&     sN   

 2z%SpeechToTextService._call_whisper_apiaudio_segmentsc           
   
      s   z>g }|D ])}| d}| d}|r/| ||I dH }|||t|o&| d}|| qtdt| d|  |W S  ty^ }	 zt	d| d|	  g W  Y d}	~	S d}	~	ww )	a  
        Transcribe multiple audio segments in batch.
        
        Args:
            audio_segments: List of audio segments with metadata
            session_id: Session ID for logging
            
        Returns:
            List of transcribed segments with text
        r   	timestampN)rk   r   r,   Z
has_speechu   🎤 Batch transcribed z segments for session u-   ❌ Error in batch transcription for session r   )
getr.   boolr'   appendr   r   r#   r(   r)   )
r   rj   r   Ztranscribed_segmentsZsegmentr   rk   r,   Ztranscribed_segmentr-   r   r   r   batch_transcribeE  s,   


z$SpeechToTextService.batch_transcribeN)r1   )__name__
__module____qualname____doc__r   strr   r.   r   r3   rV   r$   r5   intr6   r&   r   r   ro   r   r   r   r   r
      s    .9'6&21r
   )rs   r   r!   rZ   ZjsonZwavetypingr   r   r   r   r   rg   ZaiofilesZconfigr   Zservices.log_utilsr   Zservices.audio_noise_servicer	   r
   Zspeech_to_text_servicer   r   r   r   <module>   s"      
Y