o
    (Ei	                     @   s   d dl Z d dlZd dlmZmZ d ZdZdedefddZdedefd	d
Z	dee dee fddZ
dee defddZdededefddZdS )    N)ListDict   textreturnc                 C   sh   |   } tjdd| tjd} tdd| } dddd	}| D ]\}}tj||| tjd} q|    S )
z'Normalize grammar, spacing, repetitionsz\b(yeah|uh|um)\b )flagsz\s+ refundzThe payment was made by cardzI will update this)zpay backzcard good onlyzonce i will update)stripresubIitems
capitalize)r   replacementskv r   E/var/www/html/tatsat2/dashboard-backend/clean_transcript_formatter.py
clean_text   s   r   speaker_labelc                 C   s(   t td|  d }|tkrdS dS )u(   Convert Speaker 1/2 → Agent / Customerz\d+r   AgentCustomer)intr   searchgroupAGENT_SPEAKER_ID)r   speaker_numr   r   r   map_speaker*   s   r   segmentsc                 C   sn   g }| D ]0}t |d }t|d }|sq|r,|d d |kr,|d d  d| 7  < q|||d q|S )z,Merge consecutive segments from same speakerspeakerr   r	   )r!   r   )r   r   append)r    mergedsegroler   r   r   r   merge_segments3   s   
r'   c                 C   s4   g }| D ]}| |d  d|d   qd|S )Nr!   z: r   z

)r#   join)r    linesr%   r   r   r   build_clean_transcriptL   s   
r*   raw_transcriptspeaker_segments_jsonc                 C   s$   t |}t|}t|}||dS )N)clean_transcriptclean_speaker_diarization)jsonloadsr'   r*   )r+   r,   speaker_segmentsmerged_segmentsr-   r   r   r   format_outputV   s   
r3   )r/   r   typingr   r   r   CUSTOMER_SPEAKER_IDstrr   r   r'   r*   r3   r   r   r   r   <module>   s    	
