o
    i̖i\                     @  s   d dl mZ d dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dlZd dlZd dlmZmZ d dlmZ d dlmZmZ d d	lmZ d
dlmZ d
dlmZ dZeG dd dZG dd dejjZG dd dejj Z dS )    )annotationsN)	dataclass)Path)Literal)agentsrtc)utils)	NOT_GIVEN
NotGivenOr)is_given   )
onnx_modelloggerg?c                   @  sF   e Zd ZU ded< ded< ded< ded< ded< ded< ded	< d
S )_VADOptionsfloatmin_speech_durationmin_silence_durationprefix_padding_durationmax_buffered_speechactivation_thresholddeactivation_thresholdintsample_rateN)__name__
__module____qualname____annotations__ r   r   \/var/www/html/livekit_bhavya/venv/lib/python3.10/site-packages/livekit/plugins/silero/vad.pyr   )   s   
 r   c                      s   e Zd ZdZedddddddeeed
d-ddZd. fd d!Zed/d#d$Z	ed/d%d&Z
d0d(d)Zeeeeeed*d1d+d,Z  ZS )2VADz
    Silero Voice Activity Detection (VAD) class.

    This class provides functionality to detect speech segments within audio data using the Silero VAD model.
    g?g?g      ?g      N@i>  T)
r   r   r   r   r   r   	force_cpuonnx_file_pathr   padding_durationr   r   r   r   r   r   r   Literal[8000, 16000]r!   boolr"   NotGivenOr[Path | str]r   NotGivenOr[float]r#   returnc       
      	   C  s   |t jvr	tdt|
rtd |
}t|	r |	dkr tdt j||p&dd}t||||||	p7t|d d|d	}| ||d
S )a  
        Load and initialize the Silero VAD model.

        This method loads the ONNX model and prepares it for inference. When options are not provided,
        sane defaults are used.

        **Note:**
            This method is blocking and may take time to load the model into memory.
            It is recommended to call this method inside your prewarm mechanism.

        **Example:**

            ```python
            def prewarm(proc: JobProcess):
                proc.userdata["vad"] = silero.VAD.load()


            async def entrypoint(ctx: JobContext):
                vad = (ctx.proc.userdata["vad"],)
                # your agent logic...


            if __name__ == "__main__":
                cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint, prewarm_fnc=prewarm))
            ```

        Args:
            min_speech_duration (float): Minimum duration of speech to start a new speech chunk.
            min_silence_duration (float): At the end of each speech, wait this duration before ending the speech.
            prefix_padding_duration (float): Duration of padding to add to the beginning of each speech chunk.
            max_buffered_speech (float): Maximum duration of speech to keep in the buffer (in seconds).
            activation_threshold (float): Threshold to consider a frame as speech.
            sample_rate (Literal[8000, 16000]): Sample rate for the inference (only 8KHz and 16KHz are supported).
            onnx_file_path (Path | str | None): Path to the ONNX model file. If not provided, the default model will be loaded. This can be helpful if you want to use a previous version of the silero model.
            force_cpu (bool): Force the use of CPU for inference.
            deactivation_threshold (float): Negative threshold (noise or exit threshold). If model's current state is SPEECH, values BELOW this value are considered as NON-SPEECH. Default is max(activation_threshold - 0.15, 0.01).
            padding_duration (float | None): **Deprecated**. Use `prefix_padding_duration` instead.

        Returns:
            VAD: An instance of the VAD class ready for streaming.

        Raises:
            ValueError: If an unsupported sample rate is provided.
        z4Silero VAD only supports 8KHz and 16KHz sample ratesz`padding_duration is deprecated and will be removed in 1.5.0, use prefix_padding_duration insteadr   z-deactivation_threshold must be greater than 0N)r"   g333333?g{Gz?)r   r   r   r   r   r   r   )sessionopts)	r   SUPPORTED_SAMPLE_RATES
ValueErrorr   r   warningnew_inference_sessionr   max)clsr   r   r   r   r   r   r!   r"   r   r#   r)   r*   r   r   r   load;   s(   
<	zVAD.loadr)   onnxruntime.InferenceSessionr*   r   Nonec                  s6   t  jtjjddd || _|| _tjt	  | _
d S )NgMb?)update_interval)capabilities)super__init__r   vadVADCapabilities_onnx_session_optsweakrefWeakSet	VADStream_streams)selfr)   r*   	__class__r   r   r7      s   zVAD.__init__strc                 C     dS )Nsileror   r@   r   r   r   model      z	VAD.modelc                 C  rD   )NONNXr   rF   r   r   r   provider   rH   zVAD.providerr>   c                 C  s.   t | | jtj| j| jjd}| j| |S )z
        Create a new VADStream for processing audio data.

        Returns:
            VADStream: A stream object for processing audio input and detecting speech.
        )onnx_sessionr   )r>   r;   r   	OnnxModelr:   r   r?   add)r@   streamr   r   r   rN      s   
z
VAD.streamr   r   r   r   r   r   c             	   C  s   t |r|| j_t |r|| j_t |r|| j_t |r || j_t |r(|| j_t |r0|| j_| jD ]}|j	||||||d q3dS )a  
        Update the VAD options.

        This method allows you to update the VAD options after the VAD object has been created.

        Args:
            min_speech_duration (float): Minimum duration of speech to start a new speech chunk.
            min_silence_duration (float): At the end of each speech, wait this duration before ending the speech.
            prefix_padding_duration (float): Duration of padding to add to the beginning of each speech chunk.
            max_buffered_speech (float): Maximum duration of speech to keep in the buffer (in seconds).
            activation_threshold (float): Threshold to consider a frame as speech.
        rO   N)
r   r;   r   r   r   r   r   r   r?   update_options)r@   r   r   r   r   r   r   rN   r   r   r   rP      s,   
zVAD.update_options)r   r   r   r   r   r   r   r   r   r   r   r$   r!   r%   r"   r&   r   r'   r#   r'   r(   r    )r)   r2   r*   r   r(   r3   )r(   rC   )r(   r>   r   r'   r   r'   r   r'   r   r'   r   r'   r   r'   r(   r3   )r   r   r   __doc__classmethodr	   r1   r7   propertyrG   rJ   rN   rP   __classcell__r   r   rA   r   r    4   s6    S
r    c                      sP   e Zd Zd fd	d
ZeeeeeeddddZejje	ddddZ
  ZS )r>   r8   r    r*   r   rG   onnx_model.OnnxModelr(   r3   c                   sN   t  | ||| _| _t | _tjdd| _	d| _
d | _d| _d| _d S )Ngffffff?)alphar   F)r6   r7   r;   _modelasyncioget_event_loop_loopr   	ExpFilter_exp_filter_input_sample_rate_speech_buffer_speech_buffer_max_reached_prefix_padding_samples)r@   r8   r*   rG   rA   r   r   r7      s   

zVADStream.__init__rO   r   r'   r   r   r   r   r   c                C  s   | j j}t|r|| j _t|r|| j _t|r|| j _t|r$|| j _t|r,|| j _t|r4|| j _| jrc| j	dus>J t
| j j| j | _| j	t
| j j| j | j  | j j|kred| _dS dS dS )aJ  
        Update the VAD options.

        This method allows you to update the VAD options after the VAD object has been created.

        Args:
            min_speech_duration (float): Minimum duration of speech to start a new speech chunk.
            min_silence_duration (float): At the end of each speech, wait this duration before ending the speech.
            prefix_padding_duration (float): Duration of padding to add to the beginning of each speech chunk.
            max_buffered_speech (float): Maximum duration of speech to keep in the buffer (in seconds).
            activation_threshold (float): Threshold to consider a frame as speech.
            deactivation_threshold (float): Negative threshold (noise or exit threshold). If model's current state is SPEECH, values BELOW this value are considered as NON-SPEECH.
        NF)r;   r   r   r   r   r   r   r   r^   r_   r   ra   resizer`   )r@   r   r   r   r   r   r   old_max_buffered_speechr   r   r   rP      s6   
zVADStream.update_optionsr   c                   s  t j jjt jd}dd}d}d}d}d}d}d}g }	g }
d }d}d} j2 z>3 d H W }t|tjs6q( j	so|j
 _	t jj j	  _t jt jj j	  j t jd _ j	 jj
krntj j	 jj
tjjd}n j	|j
kr{td q( jd usJ |	| |d ur|
|| n|
| 	 t }tdd	 |
D }| jjk rnt|	}t|
}t j|j d  jj t !t jj"|t jd
  j#$d  j|I d H } j%j&d|d} jj jj
 }| jj7 }||7 } j	 jj
 } jj| | }t|}|| }t' j }t(||}|dkr0|j d |  j| < |7 n j)s<d _)t*d t | }t"d|| | }|t+krYtj*dd|id d fdd}d fdd}|rq||7 }n||7 } j,-t.j/j0t.j/j1j2||||||tj|j d | 3  j	d|dg|||d | jj4ks|r| jj5kr||7 }d}|s| jj6krd}d}|} j,-t.j/j0t.j/j1j7||||| gdd n:||7 }d}|s|  |r| jj8krd}|} j,-t.j/j0t.j/j1j9|||t"d|| | gdd d}|  g }	g }
t'|j | dkr=|j |d  3 }|	tj| j	dt'|d d t'|j  jj dkrf|j  jjd  3 }|
tj| jj
dt'|d d qq(6 d S )N)dtyper   Fg        )
input_rateoutput_ratequalityz3a frame with another sample rate was already pushedTc                 S  s   g | ]}|j qS r   )samples_per_channel).0framer   r   r   
<listcomp>g  s    z(VADStream._main_task.<locals>.<listcomp>)outrd   g      ?)expsamplezOmax_buffered_speech reached, ignoring further data for the current speech inputz!inference is slower than realtimedelay)extrar(   r3   c                    sP    j d usJ  jkrd S  j  j  } d _|  j d  j<  jd S )NF)r_   ra   r`   )padding_datar@   speech_buffer_indexr   r   _reset_write_cursor  s   

z1VADStream._main_task.<locals>._reset_write_cursorrtc.AudioFramec                    s4    j d usJ  j d   } tj jd| dS )Nr   )r   num_channelsrh   data)r_   tobytesr   
AudioFramer^   )speech_datarr   r   r   _copy_speech_buffer  s   z1VADStream._main_task.<locals>._copy_speech_bufferr   )rw   r   rv   rh   )typesamples_index	timestampsilence_durationspeech_durationprobabilityinference_durationframesspeakingraw_accumulated_silenceraw_accumulated_speech)r|   r}   r~   r   r   r   r      r(   r3   )r(   ru   ):npemptyrX   window_size_samplesfloat32	_input_ch
isinstancer   ry   r^   r   r   r;   r   ra   r   int16r_   AudioResamplerAudioResamplerQualityQUICKr   errorappendextendpushtimeperf_countersumr   combine_framesdividerw   iinfor/   r[   run_in_executorr]   applylenminr`   r-   SLOW_INFERENCE_THRESHOLD	_event_chsend_nowaitr   r8   VADEventVADEventTypeINFERENCE_DONErx   r   r   r   START_OF_SPEECHr   END_OF_SPEECH)r@   inference_f32_datapub_speakingpub_speech_durationpub_silence_durationpub_current_samplepub_timestampspeech_threshold_durationsilence_threshold_durationinput_framesinference_frames	resamplerinput_copy_remaining_fractextra_inference_timeinput_frame
start_timeavailable_inference_samplesinference_framepwindow_durationresampling_ratioto_copyto_copy_intavailable_spaceto_copy_bufferr   rt   r{   rw   r   rr   r   
_main_task"  s~  












	
 zVADStream._main_task)r8   r    r*   r   rG   rV   r(   r3   rQ   r   )r   r   r   r7   r	   rP   r   r   log_exceptionsr   r   rU   r   r   rA   r   r>      s    5r>   )!
__future__r   rY   r   r<   dataclassesr   pathlibr   typingr   numpyr   onnxruntimelivekitr   r   livekit.agentsr   livekit.agents.typesr	   r
   livekit.agents.utilsr    r   logr   r   r   r8   r    r>   r   r   r   r   <module>   s*   
 .