import json
import logging
import time
import os

from dotenv import load_dotenv
from livekit import rtc
from livekit.agents import (
    Agent,
    AgentServer,
    AgentSession,
    JobContext,
    JobProcess,
    cli,
    inference,
    room_io,
)
from livekit.plugins import noise_cancellation, silero
from livekit.plugins import elevenlabs
from livekit.plugins.turn_detector.multilingual import MultilingualModel
from conversation_logger import logger as conversation_logger

# Module-level logger; every log line emitted by this file goes through the "agent" logger.
logger = logging.getLogger("agent")

# Load environment variables from .env.local before any of them are read
# (the ElevenLabs API key is looked up via os.getenv further down in this file).
load_dotenv(".env.local")


# System prompt used by Assistant whenever no (or an empty) prompt is supplied.
DEFAULT_INSTRUCTIONS = """You are a helpful voice AI. Always reply in one short sentence (under 15 words when possible). No formatting, emojis, or symbols. Be friendly. This is voice: speed matters more than detail."""


class Assistant(Agent):
    """Voice agent whose system prompt falls back to ``DEFAULT_INSTRUCTIONS``."""

    def __init__(self, instructions: str | None = None) -> None:
        """Create the agent.

        Args:
            instructions: System prompt for this session. ``None`` or an empty
                string selects ``DEFAULT_INSTRUCTIONS``.
        """
        prompt = instructions if instructions else DEFAULT_INSTRUCTIONS
        super().__init__(instructions=prompt)

    # Tools are added with the @function_tool decorator; the example below adds a
    # simple weather tool. It also requires
    # `from livekit.agents import function_tool, RunContext` at the top of this file.
    # @function_tool
    # async def lookup_weather(self, context: RunContext, location: str):
    #     """Use this tool to look up current weather information in the given location.
    #
    #     If the location is not supported by the weather service, the tool will indicate this. You must tell the user the location's weather is unavailable.
    #
    #     Args:
    #         location: The location to look up weather information for (e.g. city name)
    #     """
    #
    #     logger.info(f"Looking up weather for {location}")
    #
    #     return "sunny with a temperature of 70 degrees."


server = AgentServer()

# Startup banner, logged once when this module is imported.
_banner_rule = "=" * 60
for _banner_line in (
    _banner_rule,
    "AGENT SERVER INITIALIZED",
    "Agent is ready to accept connections",
    _banner_rule,
):
    logger.info(_banner_line)


def _warm_elevenlabs_tts():
    """Best-effort warm-up of ElevenLabs TTS to cut cold-start time on the first reply.

    Returns immediately when neither ``ELEVENLABS_API_KEY`` nor ``ELEVEN_API_KEY``
    is set. Any failure while building or prewarming the client is logged at
    debug level and otherwise ignored, so this never raises.
    """
    api_key = os.getenv("ELEVENLABS_API_KEY") or os.getenv("ELEVEN_API_KEY")
    if not api_key:
        return

    try:
        client = elevenlabs.TTS(
            api_key=api_key,
            voice_id="EXAVITQu4vr4xnSDxMaL",
            model="eleven_turbo_v2_5",
            streaming_latency=4,
        )
        # Not every plugin version necessarily exposes prewarm(); skip quietly if absent.
        if not hasattr(client, "prewarm"):
            return
        client.prewarm()
        logger.info("TTS prewarm done")
    except Exception as exc:
        logger.debug("TTS prewarm failed (non-fatal): %s", exc)


def prewarm(proc: JobProcess):
    """Load the Silero VAD model and store it under ``proc.userdata["vad"]``.

    TTS warm-up is intentionally not done here so the worker is ready
    immediately (avoids "agent did not join" when a job arrives during prewarm).

    Args:
        proc: The job process whose ``userdata`` mapping receives the VAD.
    """
    logger.info("Prewarming agent process...")
    vad_model = silero.VAD.load()
    proc.userdata["vad"] = vad_model
    logger.info("Prewarm complete - VAD loaded")


# Register the hook above as the server's process setup function.
server.setup_fnc = prewarm


# Fallbacks applied when the metadata omits a setting or sends it blank/null.
# The default voice must exist on your ElevenLabs account
# (1Z7Y8o9cvUeWq8oLKgMY did not exist on this account).
_DEFAULT_VOICE_ID = "EXAVITQu4vr4xnSDxMaL"
_DEFAULT_LANGUAGE = "en"
_DEFAULT_LLM_MODEL = "openai/gpt-4.1-mini"
_DEFAULT_STT_LANGUAGE_CODE = "en"
_DEFAULT_TTS_MODEL = "eleven_turbo_v2_5"


def _event_field(obj, key, default=None):
    """Read ``key`` from an event/item that may be an object or a plain dict."""
    if obj is None:
        return default
    if hasattr(obj, key):
        return getattr(obj, key)
    if isinstance(obj, dict):
        return obj.get(key, default)
    return default


def _decode_metadata(raw, source: str) -> dict:
    """Decode one JSON metadata payload, returning ``{}`` when it is unusable.

    Args:
        raw: The raw metadata payload (expected to be a JSON string).
        source: Short label ("agent" / "room") used in warning messages.

    Returns:
        The decoded JSON object, or an empty dict when ``raw`` is not valid
        JSON or decodes to something other than an object.
    """
    try:
        decoded = json.loads(raw)
    except (ValueError, TypeError):
        # json.JSONDecodeError is a ValueError; TypeError covers non-string payloads.
        logger.warning(f"Failed to parse {source} metadata as JSON: {raw}")
        return {}
    if not isinstance(decoded, dict):
        # Valid JSON such as "[1]" or "\"x\"" would otherwise crash on .get() later.
        logger.warning(
            "Ignoring %s metadata: expected a JSON object, got %s",
            source,
            type(decoded).__name__,
        )
        return {}
    return decoded


def _resolve_session_metadata(ctx: JobContext) -> dict:
    """Pick the session configuration dict: dispatch metadata first, then room metadata.

    Participant metadata is not consulted; when neither source yields a
    non-empty JSON object an empty dict is returned and defaults apply.
    """
    # `agent_metadata` is kept for backward compatibility; the LiveKit dispatch
    # docs expose the dispatch payload as `job.metadata`, so fall back to that.
    dispatch_raw = getattr(ctx.job, "agent_metadata", None) or getattr(ctx.job, "metadata", None)
    if dispatch_raw:
        metadata = _decode_metadata(dispatch_raw, "agent")
        if metadata:
            logger.info(f"Using agent dispatch metadata: {metadata}")
            return metadata

    # NOTE(review): this runs before ctx.connect(); room metadata may not be
    # populated yet at this point - confirm against the SDK.
    room_raw = ctx.room.metadata
    if room_raw:
        metadata = _decode_metadata(room_raw, "room")
        if metadata:
            logger.info(f"Using room metadata: {metadata}")
            return metadata

    return {}


def _content_text(content) -> str:
    """Flatten a chat item's content into one string for logging.

    Plain-string parts are used as-is; other parts contribute their ``text``
    attribute when they have one. If nothing textual is found, a truncated
    ``str(content)`` is returned so the log line is never silently empty.
    """
    if not content:
        return ""
    parts = content if isinstance(content, list) else [content]
    pieces = []
    for part in parts:
        if isinstance(part, str):
            pieces.append(part)
            continue
        part_text = getattr(part, "text", None)
        if part_text:
            pieces.append(part_text)
    return "".join(pieces) or str(content)[:200]


# agent_name must match the name requested by the frontend (room_config.agents[0].agent_name)
@server.rtc_session(agent_name="default")
async def my_agent(ctx: JobContext):
    """Run one voice session: configure the pipeline, join the room, log the conversation.

    Configuration is read from JSON metadata (see ``_resolve_session_metadata``)
    using these keys: ``agent_instructions``, ``agent_first_message``,
    ``agent_voice``, ``agent_language``, ``agent_llm_model``,
    ``agent_stt_language_code``, ``agent_tts_model``, ``agent_tts_encoding``.
    Missing, null or blank values fall back to the module defaults.

    Args:
        ctx: Job context for this session (room, job info, prewarmed process data).

    Raises:
        Exception: Whatever ``session.start`` raises is logged and re-raised.
    """
    # Logging setup
    # Add any other context you want in all log entries here
    ctx.log_context_fields = {
        "room": ctx.room.name,
    }

    metadata = _resolve_session_metadata(ctx)

    custom_instructions = metadata.get("agent_instructions")
    custom_first_message = metadata.get("agent_first_message")
    # `or` (rather than a .get default) so that null / "" in the payload also
    # selects the default instead of reaching the provider as an invalid value.
    custom_voice = metadata.get("agent_voice") or _DEFAULT_VOICE_ID
    custom_language = metadata.get("agent_language") or _DEFAULT_LANGUAGE
    custom_llm_model = metadata.get("agent_llm_model") or _DEFAULT_LLM_MODEL
    custom_stt_language_code = metadata.get("agent_stt_language_code") or _DEFAULT_STT_LANGUAGE_CODE
    custom_tts_model = metadata.get("agent_tts_model") or _DEFAULT_TTS_MODEL
    custom_tts_encoding = metadata.get("agent_tts_encoding")
    # NOTE: an `agent_stt_model` key may be present in the metadata but is not
    # applied anywhere in this pipeline.

    # Latency tracking. Monotonic clock readings: only differences are meaningful,
    # and they are immune to wall-clock adjustments.
    user_message_start = None  # when the user was last reported as "speaking"
    t_user_commit = None  # when the last final transcript arrived
    t_agent_response = None  # when the last assistant item was added

    # Set up a voice AI pipeline using ElevenLabs STT/TTS and the LiveKit turn detector
    # Support both ELEVENLABS_API_KEY and ELEVEN_API_KEY (plugin default)
    elevenlabs_key = os.getenv("ELEVENLABS_API_KEY") or os.getenv("ELEVEN_API_KEY")
    if not elevenlabs_key:
        logger.error("ELEVENLABS_API_KEY (or ELEVEN_API_KEY) not set – STT/TTS will fail. Set it in .env.local")

    stt_kwargs = {"language_code": custom_stt_language_code}
    tts_kwargs = {
        "voice_id": custom_voice,
        "model": custom_tts_model,
        # Lowest latency: 4 = max optimizations + no text normalizer (0=default, 3=max with normalizer)
        "streaming_latency": 4,
    }
    if elevenlabs_key:
        stt_kwargs["api_key"] = elevenlabs_key
        tts_kwargs["api_key"] = elevenlabs_key
    if custom_tts_encoding:
        tts_kwargs["encoding"] = custom_tts_encoding

    session = AgentSession(
        stt=elevenlabs.STT(**stt_kwargs),
        llm=inference.LLM(model=custom_llm_model),
        tts=elevenlabs.TTS(**tts_kwargs),
        turn_detection=MultilingualModel(),
        vad=ctx.proc.userdata["vad"],
        preemptive_generation=True,
        # Aggressive endpointing for <1s target: commit user turn within 0.5s of silence
        min_endpointing_delay=0.1,
        max_endpointing_delay=0.5,
    )

    # Event handlers that log the conversation and latency (Python SDK event names).
    # NOTE(review): the latency figures assume the events of one turn arrive in
    # the order user commit -> assistant item -> speech_created; timestamps are
    # not reset between turns, so confirm the SDK's ordering before relying on them.

    # user_input_transcribed: STT output (is_final=True = user turn committed)
    @session.on("user_input_transcribed")
    def on_user_input_transcribed(ev):
        nonlocal user_message_start, t_user_commit
        transcript = _event_field(ev, "transcript") or ""
        is_final = _event_field(ev, "is_final") or False
        logger.info(f"[EVENT] user_input_transcribed is_final={is_final} transcript={transcript!r}")
        if not is_final:
            return
        conversation_logger.log_user_message(transcript, None)
        t_user_commit = time.monotonic()
        if user_message_start is not None:
            user_turn_ms = (t_user_commit - user_message_start) * 1000
            logger.info(f"[LATENCY] User turn (speaking + STT final): {user_turn_ms:.0f} ms")
        user_message_start = None
        logger.info("[LATENCY] user_speech_committed at T+0 (reference for LLM/TTS timing)")

    # conversation_item_added: when a message is committed (user or assistant)
    @session.on("conversation_item_added")
    def on_conversation_item_added(ev):
        nonlocal t_agent_response
        item = _event_field(ev, "item")
        if item is None:
            return
        role = _event_field(item, "role")
        text = _content_text(_event_field(item, "content", []))
        if role == "assistant":
            t_agent_response = time.monotonic()
            logger.info(f"[EVENT] conversation_item_added role=assistant text={text[:80]!r}...")
            if t_user_commit is not None:
                llm_ms = (t_agent_response - t_user_commit) * 1000
                logger.info(f"[LATENCY] LLM (user_commit -> agent_response): {llm_ms:.0f} ms")
            conversation_logger.log_bot_message(text, None)
        elif role == "user":
            logger.info(f"[EVENT] conversation_item_added role=user text={text[:80]!r}...")

    # speech_created: treated here as the moment the agent starts producing audio
    @session.on("speech_created")
    def on_speech_created(ev):
        t_agent_speak = time.monotonic()
        source = _event_field(ev, "source") or "?"
        logger.info(f"[EVENT] speech_created source={source}")
        if t_user_commit is not None:
            e2e_ms = (t_agent_speak - t_user_commit) * 1000
            logger.info(f"[LATENCY] End-to-end (user_commit -> first audio): {e2e_ms:.0f} ms")
        if t_agent_response is not None:
            tts_ms = (t_agent_speak - t_agent_response) * 1000
            logger.info(f"[LATENCY] TTS (agent_response -> first audio): {tts_ms:.0f} ms")

    # agent_state_changed: e.g. idle -> speaking (logged for diagnostics only)
    @session.on("agent_state_changed")
    def on_agent_state_changed(ev):
        old_s = _event_field(ev, "old_state")
        new_s = _event_field(ev, "new_state")
        logger.info(f"[EVENT] agent_state_changed {old_s} -> {new_s}")

    # user_state_changed: marks when the user starts speaking (for user_message_start)
    @session.on("user_state_changed")
    def on_user_state_changed(ev):
        nonlocal user_message_start
        state = _event_field(ev, "new_state") or _event_field(ev, "state")
        logger.info(f"[EVENT] user_state_changed state={state}")
        if state == "speaking":
            user_message_start = time.monotonic()

    # To use a realtime model instead of a voice pipeline, use the following session setup instead.
    # (Note: This is for the OpenAI Realtime API. For other providers, see https://docs.livekit.io/agents/models/realtime/)
    # 1. Install livekit-agents[openai]
    # 2. Set OPENAI_API_KEY in .env.local
    # 3. Add `from livekit.plugins import openai` to the top of this file
    # 4. Use the following session setup instead of the version above
    # session = AgentSession(
    #     llm=openai.realtime.RealtimeModel(voice="marin")
    # )

    # # Add a virtual avatar to the session, if desired
    # # For other providers, see https://docs.livekit.io/agents/models/avatar/
    # avatar = hedra.AvatarSession(
    #   avatar_id="...",  # See https://docs.livekit.io/agents/models/avatar/plugins/hedra
    # )
    # # Start the avatar and wait for it to join
    # await avatar.start(session, room=ctx.room)

    # Connect to the room first so LiveKit sees the connection within its join
    # window (10s), then start the voice session (RoomIO).
    await ctx.connect()

    try:
        await session.start(
            agent=Assistant(instructions=custom_instructions),
            room=ctx.room,
            room_options=room_io.RoomOptions(
                audio_input=room_io.AudioInputOptions(
                    # SIP callers get the telephony-tuned noise canceller.
                    noise_cancellation=lambda params: noise_cancellation.BVCTelephony()
                    if params.participant.kind == rtc.ParticipantKind.PARTICIPANT_KIND_SIP
                    else noise_cancellation.BVC(),
                ),
            ),
        )
    except Exception as e:
        logger.exception("Failed to start voice session (check ELEVENLABS_API_KEY, OPENAI_API_KEY / LiveKit Inference): %s", e)
        raise

    # Optional: if the caller provided a "first message" via metadata,
    # speak immediately after joining (before the user says anything).
    if custom_first_message:
        try:
            session.say(custom_first_message)
            logger.info("Sent first assistant message from agent_first_message metadata")
        except Exception as e:
            # Best-effort: a failed greeting must not tear down the session.
            logger.warning("Failed to send first assistant message: %s", e)

    # Now that we're in the room, start conversation logging and log config (no longer on critical path)
    try:
        conversation_logger.start_session(
            room_name=ctx.room.name,
            instructions=custom_instructions or DEFAULT_INSTRUCTIONS,
            voice_id=custom_voice,
            language=custom_language,
            llm_model=custom_llm_model,
        )
    except Exception as e:
        logger.error("Conversation logging start failed: %s", e)

    logger.info("Agent is in the room and listening. User can say hello.")

    # Save the conversation log when the job shuts down.
    async def cleanup():
        logger.info("Session ending, saving conversation log...")
        log_file = conversation_logger.end_session()
        logger.info(f"Conversation log saved to: {log_file}")

    ctx.add_shutdown_callback(cleanup)


# Script entry point: hand the configured server to the LiveKit agents CLI runner.
if __name__ == "__main__":
    cli.run_app(server)
