o
    i                     @  s   d dl mZ d dlZd dlZd dlmZmZ d dlmZm	Z	 ddl
mZmZ ddlmZ eed	d Zer<dd
lmZ G dd de	ZeG dd dZG dd dZdS )    )annotationsN)	dataclassfield)TYPE_CHECKINGProtocol   )LLMChatContext   )JudgmentResultLIVEKIT_EVALS_VERBOSE)	LLMModelsc                   @  s0   e Zd ZdZedddZddddddZdS )	Evaluatorz9Protocol for any object that can evaluate a conversation.returnstrc                 C  s   dS )z Name identifying this evaluator.N selfr   r   a/var/www/html/livekit_bhavya/venv/lib/python3.10/site-packages/livekit/agents/evals/evaluation.pyname   s   zEvaluator.nameN)	referencellmchat_ctxr	   r   ChatContext | Noner   
LLM | Noner   c                  s   d S Nr   )r   r   r   r   r   r   r   evaluate   s   zEvaluator.evaluate)r   r   )r   r	   r   r   r   r   r   r   )__name__
__module____qualname____doc__propertyr   r   r   r   r   r   r      s    r   c                   @  sl   e Zd ZU dZeedZded< 	 edddZ	edd
dZ
edddZedddZedddZdS )EvaluationResultz;Result of evaluating a conversation with a group of judges.)default_factoryzdict[str, JudgmentResult]	judgmentsr   floatc                 C  sJ   | j sdS d}| j  D ]}|jr|d7 }q|jr|d7 }q|t| j  S )z1Score from 0.0 to 1.0. Pass=1, maybe=0.5, fail=0.g        g      ?g      ?)r$   valuespassed	uncertainlen)r   totaljr   r   r   score)   s   
zEvaluationResult.scoreboolc                 C     t dd | j D S )z9True if all judgments passed. Maybes count as not passed.c                 s      | ]}|j V  qd S r   r'   .0r+   r   r   r   	<genexpr>9       z.EvaluationResult.all_passed.<locals>.<genexpr>)allr$   r&   r   r   r   r   
all_passed6      zEvaluationResult.all_passedc                 C  r.   )z%True if at least one judgment passed.c                 s  r/   r   r0   r1   r   r   r   r3   >   r4   z.EvaluationResult.any_passed.<locals>.<genexpr>anyr$   r&   r   r   r   r   
any_passed;   r7   zEvaluationResult.any_passedc                 C  s   | j sdS | jt| j d kS )z/True if more than half of the judgments passed.Tr   )r$   r,   r)   r   r   r   r   majority_passed@   s   z EvaluationResult.majority_passedc                 C  s   t dd | j D  S )z;True if no judgments explicitly failed. Maybes are allowed.c                 s  r/   r   )failedr1   r   r   r   r3   J   r4   z/EvaluationResult.none_failed.<locals>.<genexpr>r8   r   r   r   r   none_failedG   s   zEvaluationResult.none_failedN)r   r%   )r   r-   )r   r   r   r    r   dictr$   __annotations__r!   r,   r6   r:   r;   r=   r   r   r   r   r"   "   s   
 r"   c                   @  sL   e Zd ZdZdddd
dZedddZedddZdddddZdS )
JudgeGroupa  A group of judges that evaluate conversations together.

    Automatically tags the session with judgment results when called within a job context.

    Example:
        ```python
        async def on_session_end(ctx: JobContext) -> None:
            judges = JudgeGroup(
                llm="openai/gpt-4o-mini",
                judges=[
                    task_completion_judge(),
                    accuracy_judge(),
                ],
            )

            report = ctx.make_session_report()
            result = await judges.evaluate(report.chat_history)
            # Results are automatically tagged to the session
        ```
    N)judgesr   LLM | LLMModels | strrA   list[Evaluator] | Noner   Nonec                C  s6   t |trddlm} ||| _n|| _|pg | _dS )a  Initialize a JudgeGroup.

        Args:
            llm: The LLM to use for evaluation. Can be an LLM instance or a model
                string like "openai/gpt-4o-mini" (uses LiveKit inference gateway).
            judges: The judges to run during evaluation.
        r   )r   N)
isinstancer   	inferencer   _llm_judges)r   r   rA   InferenceLLMr   r   r   __init__c   s
   
zJudgeGroup.__init__r   c                 C     | j S )zThe LLM used for evaluation.)rG   r   r   r   r   r   y      zJudgeGroup.llmlist[Evaluator]c                 C  rK   )z$The judges to run during evaluation.)rH   r   r   r   r   rA   ~   rL   zJudgeGroup.judges)r   r   r	   r   r   r"   c          
        s  ddl m} ddlm d fdd	tjfd
djD  I dH }i }|D ]\}}t|tr8|||< q+t	|d}t
rptd |D ])\}}t|trdtd| d|j  td|j d qFtd| d| d qFz| }	|	j| W |S  ty   Y |S w )ab  Evaluate a conversation with all judges.

        Automatically tags the session with results when called within a job context.

        Args:
            chat_ctx: The conversation to evaluate.
            reference: Optional reference conversation for comparison.

        Returns:
            EvaluationResult containing all judgment results.
        r   )get_job_context)loggerjudger   r   *tuple[str, JudgmentResult | BaseException]c              
     sn   z| j  jdI d H }| j|fW S  ty6 } zd| j d|  | j|fW  Y d }~S d }~ww )N)r   r   r   zJudge 'z
' failed: )r   rG   r   	Exceptionwarning)rP   resulte)r   rO   r   r   r   r   	run_judge   s   z&JudgeGroup.evaluate.<locals>.run_judgec                   s   g | ]} |qS r   r   r1   )rV   r   r   
<listcomp>   s    z'JudgeGroup.evaluate.<locals>.<listcomp>N)r$   z!
+ JudgeGroup evaluation results:z  [z
] verdict=z    reasoning: 
z	] ERROR: )rP   r   r   rQ   )jobrN   logrO   asynciogatherrH   rE   r   r"   _evals_verboseprintverdict	reasoningtagger_evaluationRuntimeError)
r   r   r   rN   resultsr$   r   rT   evaluation_resultctxr   )r   rO   r   rV   r   r   r      s4    


zJudgeGroup.evaluate)r   rB   rA   rC   r   rD   )r   r   )r   rM   )r   r	   r   r   r   r"   )	r   r   r   r    rJ   r!   r   rA   r   r   r   r   r   r@   M   s    r@   )
__future__r   r[   osdataclassesr   r   typingr   r   r   r   r	   rP   r   intgetenvr]   rF   r   r   r"   r@   r   r   r   r   <module>   s    *