# This file was auto-generated by Fern from our API Definition.

import json
import typing
from json.decoder import JSONDecodeError

from ..core.api_error import ApiError
from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
from ..core.http_response import AsyncHttpResponse, HttpResponse
from ..core.pydantic_utilities import parse_obj_as
from ..core.request_options import RequestOptions
from ..core.serialization import convert_and_respect_annotation_metadata
from ..errors.bad_request_error import BadRequestError
from ..errors.forbidden_error import ForbiddenError
from ..errors.internal_server_error import InternalServerError
from ..errors.too_many_requests_error import TooManyRequestsError
from ..errors.unprocessable_entity_error import UnprocessableEntityError
from ..requests.chat_completion_request_message import ChatCompletionRequestMessageParams
from ..requests.chat_completion_tool import ChatCompletionToolParams
from ..requests.stop_configuration import StopConfigurationParams
from ..requests.tool_choice_option import ToolChoiceOptionParams
from ..types.chat_completion_chunk import ChatCompletionChunk
from ..types.create_chat_completion_response import CreateChatCompletionResponse
from ..types.reasoning_effort import ReasoningEffort
from ..types.sarvam_model_ids import SarvamModelIds

# Sentinel used as the default value for optional parameters; it is also handed
# to the HTTP client as its `omit=` argument.
OMIT = typing.cast(typing.Any, Ellipsis)


class RawChatClient:
    """
    Synchronous raw client for the `v1/chat/completions` endpoint.

    Non-streaming calls return an `HttpResponse` wrapping the parsed response
    body; streaming calls return an iterator of `ChatCompletionChunk` objects
    parsed from the server-sent event stream.
    """

    def __init__(self, *, client_wrapper: SyncClientWrapper):
        """
        Parameters
        ----------
        client_wrapper : SyncClientWrapper
            Wrapper supplying the HTTP client and the environment (base URL) used for requests.
        """
        self._client_wrapper = client_wrapper

    @typing.overload
    def completions(
        self,
        *,
        messages: typing.Sequence[ChatCompletionRequestMessageParams],
        model: SarvamModelIds,
        temperature: typing.Optional[float] = ...,
        top_p: typing.Optional[float] = ...,
        reasoning_effort: typing.Optional[ReasoningEffort] = ...,
        max_tokens: typing.Optional[int] = ...,
        stream: typing.Literal[True],
        stop: typing.Optional[StopConfigurationParams] = ...,
        n: typing.Optional[int] = ...,
        seed: typing.Optional[int] = ...,
        frequency_penalty: typing.Optional[float] = ...,
        presence_penalty: typing.Optional[float] = ...,
        wiki_grounding: typing.Optional[bool] = ...,
        tools: typing.Optional[typing.Sequence[ChatCompletionToolParams]] = ...,
        tool_choice: typing.Optional[ToolChoiceOptionParams] = ...,
        request_options: typing.Optional[RequestOptions] = ...,
    ) -> typing.Iterator[ChatCompletionChunk]: ...

    @typing.overload
    def completions(
        self,
        *,
        messages: typing.Sequence[ChatCompletionRequestMessageParams],
        model: SarvamModelIds,
        temperature: typing.Optional[float] = ...,
        top_p: typing.Optional[float] = ...,
        reasoning_effort: typing.Optional[ReasoningEffort] = ...,
        max_tokens: typing.Optional[int] = ...,
        stream: typing.Optional[typing.Literal[False]] = ...,
        stop: typing.Optional[StopConfigurationParams] = ...,
        n: typing.Optional[int] = ...,
        seed: typing.Optional[int] = ...,
        frequency_penalty: typing.Optional[float] = ...,
        presence_penalty: typing.Optional[float] = ...,
        wiki_grounding: typing.Optional[bool] = ...,
        tools: typing.Optional[typing.Sequence[ChatCompletionToolParams]] = ...,
        tool_choice: typing.Optional[ToolChoiceOptionParams] = ...,
        request_options: typing.Optional[RequestOptions] = ...,
    ) -> HttpResponse[CreateChatCompletionResponse]: ...

    def completions(
        self,
        *,
        messages: typing.Sequence[ChatCompletionRequestMessageParams],
        model: SarvamModelIds,
        temperature: typing.Optional[float] = OMIT,
        top_p: typing.Optional[float] = OMIT,
        reasoning_effort: typing.Optional[ReasoningEffort] = OMIT,
        max_tokens: typing.Optional[int] = OMIT,
        stream: typing.Optional[bool] = OMIT,
        stop: typing.Optional[StopConfigurationParams] = OMIT,
        n: typing.Optional[int] = OMIT,
        seed: typing.Optional[int] = OMIT,
        frequency_penalty: typing.Optional[float] = OMIT,
        presence_penalty: typing.Optional[float] = OMIT,
        wiki_grounding: typing.Optional[bool] = OMIT,
        tools: typing.Optional[typing.Sequence[ChatCompletionToolParams]] = OMIT,
        tool_choice: typing.Optional[ToolChoiceOptionParams] = OMIT,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> typing.Union[HttpResponse[CreateChatCompletionResponse], typing.Iterator[ChatCompletionChunk]]:
        """
        Parameters
        ----------
        messages : typing.Sequence[ChatCompletionRequestMessageParams]
            A list of messages comprising the conversation so far.

        model : SarvamModelIds
            Model ID used to generate the response, like `sarvam-m`.

        temperature : typing.Optional[float]
            What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
            We generally recommend altering this or `top_p` but not both.

        top_p : typing.Optional[float]
            An alternative to sampling with temperature, called nucleus sampling,
            where the model considers the results of the tokens with top_p probability
            mass. So 0.1 means only the tokens comprising the top 10% probability mass
            are considered.

            We generally recommend altering this or `temperature` but not both.

        reasoning_effort : typing.Optional[ReasoningEffort]
            The effort to use for reasoning

        max_tokens : typing.Optional[int]
            The maximum number of tokens that can be generated in the chat completion.

        stream : typing.Optional[bool]
            If set to true, the model response data will be streamed to the client
            as it is generated using [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
            When true, returns an Iterator[ChatCompletionChunk] instead of HttpResponse.

        stop : typing.Optional[StopConfigurationParams]

        n : typing.Optional[int]
            How many chat completion choices to generate for each input message. Note that you will be charged based on the number of generated tokens across all of the choices. Keep `n` as `1` to minimize costs.

        seed : typing.Optional[int]
            This feature is in Beta.
            If specified, our system will make a best effort to sample deterministically, such that repeated requests with the same `seed` and parameters should return the same result.
            Determinism is not guaranteed, and you should refer to the `system_fingerprint` response parameter to monitor changes in the backend.

        frequency_penalty : typing.Optional[float]
            Number between -2.0 and 2.0. Positive values penalize new tokens based on
            their existing frequency in the text so far, decreasing the model's
            likelihood to repeat the same line verbatim.

        presence_penalty : typing.Optional[float]
            Number between -2.0 and 2.0. Positive values penalize new tokens based on
            whether they appear in the text so far, increasing the model's likelihood
            to talk about new topics.

        wiki_grounding : typing.Optional[bool]
            If set to true, the model response will be wiki grounded.

        tools : typing.Optional[typing.Sequence[ChatCompletionToolParams]]
            A list of tools the model may call. Currently, only functions are supported as a tool.

        tool_choice : typing.Optional[ToolChoiceOptionParams]
            Controls which (if any) tool is called by the model.

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration.

        Returns
        -------
        HttpResponse[CreateChatCompletionResponse] or Iterator[ChatCompletionChunk]
            When stream=False (default): HttpResponse wrapping CreateChatCompletionResponse.
            When stream=True: Iterator yielding ChatCompletionChunk objects.

        Raises
        ------
        BadRequestError, ForbiddenError, UnprocessableEntityError, TooManyRequestsError, InternalServerError
            For 400, 403, 422, 429 and 500 responses respectively.

        ApiError
            For any other non-2xx response, or when the response body cannot be decoded as JSON.

        Notes
        -----
        With stream=True the returned iterator is a generator, so the request is
        only issued (and the errors above only raised) once iteration starts.
        """
        if stream is True:
            return self._completions_stream(
                messages=messages,
                model=model,
                temperature=temperature,
                top_p=top_p,
                reasoning_effort=reasoning_effort,
                max_tokens=max_tokens,
                stop=stop,
                n=n,
                seed=seed,
                frequency_penalty=frequency_penalty,
                presence_penalty=presence_penalty,
                wiki_grounding=wiki_grounding,
                tools=tools,
                tool_choice=tool_choice,
                request_options=request_options,
            )

        _response = self._client_wrapper.httpx_client.request(
            "v1/chat/completions",
            base_url=self._client_wrapper.get_environment().base,
            method="POST",
            json={
                "messages": convert_and_respect_annotation_metadata(
                    object_=messages, annotation=typing.Sequence[ChatCompletionRequestMessageParams], direction="write"
                ),
                "model": model,
                "temperature": temperature,
                "top_p": top_p,
                "reasoning_effort": reasoning_effort,
                "max_tokens": max_tokens,
                "stream": stream,
                "stop": convert_and_respect_annotation_metadata(
                    object_=stop, annotation=StopConfigurationParams, direction="write"
                ),
                "n": n,
                "seed": seed,
                "frequency_penalty": frequency_penalty,
                "presence_penalty": presence_penalty,
                "wiki_grounding": wiki_grounding,
                "tools": convert_and_respect_annotation_metadata(
                    object_=tools, annotation=typing.Sequence[ChatCompletionToolParams], direction="write"
                ),
                "tool_choice": convert_and_respect_annotation_metadata(
                    object_=tool_choice, annotation=ToolChoiceOptionParams, direction="write"
                ),
            },
            headers={
                "content-type": "application/json",
            },
            request_options=request_options,
            omit=OMIT,
        )
        try:
            if 200 <= _response.status_code < 300:
                _data = typing.cast(
                    CreateChatCompletionResponse,
                    parse_obj_as(
                        type_=CreateChatCompletionResponse,  # type: ignore
                        object_=_response.json(),
                    ),
                )
                return HttpResponse(response=_response, data=_data)
            _response_json = _response.json()
        except JSONDecodeError:
            # The body is not JSON: report the raw text rather than a decode failure.
            raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response.text)
        raise self._error_for_status(_response, _response_json)

    @staticmethod
    def _error_for_status(response: typing.Any, body: typing.Any) -> Exception:
        """
        Build the exception for a non-2xx response whose JSON body is already decoded.

        Parameters
        ----------
        response : typing.Any
            The HTTP response; only `status_code` and `headers` are read.

        body : typing.Any
            The decoded JSON body of the response.

        Returns
        -------
        Exception
            The dedicated error for 400, 403, 422, 429 or 500; otherwise an ApiError
            carrying the status code. The caller is expected to raise it.
        """
        _typed_errors: typing.Dict[int, typing.Any] = {
            400: BadRequestError,
            403: ForbiddenError,
            422: UnprocessableEntityError,
            429: TooManyRequestsError,
            500: InternalServerError,
        }
        _headers = dict(response.headers)
        _error_type = _typed_errors.get(response.status_code)
        if _error_type is None:
            return ApiError(status_code=response.status_code, headers=_headers, body=body)
        return _error_type(
            headers=_headers,
            body=typing.cast(
                typing.Optional[typing.Any],
                parse_obj_as(
                    type_=typing.Optional[typing.Any],  # type: ignore
                    object_=body,
                ),
            ),
        )

    def _completions_stream(
        self,
        *,
        messages: typing.Sequence[ChatCompletionRequestMessageParams],
        model: SarvamModelIds,
        temperature: typing.Optional[float] = OMIT,
        top_p: typing.Optional[float] = OMIT,
        reasoning_effort: typing.Optional[ReasoningEffort] = OMIT,
        max_tokens: typing.Optional[int] = OMIT,
        stop: typing.Optional[StopConfigurationParams] = OMIT,
        n: typing.Optional[int] = OMIT,
        seed: typing.Optional[int] = OMIT,
        frequency_penalty: typing.Optional[float] = OMIT,
        presence_penalty: typing.Optional[float] = OMIT,
        wiki_grounding: typing.Optional[bool] = OMIT,
        tools: typing.Optional[typing.Sequence[ChatCompletionToolParams]] = OMIT,
        tool_choice: typing.Optional[ToolChoiceOptionParams] = OMIT,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> typing.Iterator[ChatCompletionChunk]:
        """
        Stream a chat completion, yielding one ChatCompletionChunk per `data:` event.

        Takes the same parameters as `completions` except `stream`, which is always
        sent as true. Iteration ends when the `[DONE]` event arrives or the response
        is exhausted; events whose payload is not valid JSON are skipped.

        Raises
        ------
        BadRequestError, ForbiddenError, UnprocessableEntityError, TooManyRequestsError, InternalServerError
            For 400, 403, 422, 429 and 500 responses with a JSON body, matching the
            non-streaming path.

        ApiError
            For any other non-2xx response, or when the error body is not JSON.
        """
        with self._client_wrapper.httpx_client.stream(
            "v1/chat/completions",
            base_url=self._client_wrapper.get_environment().base,
            method="POST",
            json={
                "messages": convert_and_respect_annotation_metadata(
                    object_=messages, annotation=typing.Sequence[ChatCompletionRequestMessageParams], direction="write"
                ),
                "model": model,
                "temperature": temperature,
                "top_p": top_p,
                "reasoning_effort": reasoning_effort,
                "max_tokens": max_tokens,
                "stream": True,
                "stop": convert_and_respect_annotation_metadata(
                    object_=stop, annotation=StopConfigurationParams, direction="write"
                ),
                "n": n,
                "seed": seed,
                "frequency_penalty": frequency_penalty,
                "presence_penalty": presence_penalty,
                "wiki_grounding": wiki_grounding,
                "tools": convert_and_respect_annotation_metadata(
                    object_=tools, annotation=typing.Sequence[ChatCompletionToolParams], direction="write"
                ),
                "tool_choice": convert_and_respect_annotation_metadata(
                    object_=tool_choice, annotation=ToolChoiceOptionParams, direction="write"
                ),
            },
            headers={
                "content-type": "application/json",
            },
            request_options=request_options,
            omit=OMIT,
        ) as _response:
            if not (200 <= _response.status_code < 300):
                # The body of a streamed response must be read before it can be decoded.
                _response.read()
                try:
                    _body = _response.json()
                except Exception:
                    # Best effort: any failure to decode the error body falls back to raw text.
                    _error: Exception = ApiError(
                        status_code=_response.status_code, headers=dict(_response.headers), body=_response.text
                    )
                else:
                    # NOTE(review): assumes the typed errors subclass ApiError (Fern convention),
                    # so existing `except ApiError` handlers keep working - confirm.
                    _error = self._error_for_status(_response, _body)
                raise _error

            for _line in _response.iter_lines():
                # Only `data` fields carry chunks; blank lines and other SSE fields are ignored.
                if not _line or not _line.startswith("data:"):
                    continue
                _data_str = _line[len("data:"):]
                # The SSE format allows one optional space after the colon.
                if _data_str.startswith(" "):
                    _data_str = _data_str[1:]
                if _data_str.strip() == "[DONE]":
                    return
                # Keep the try minimal so only a malformed payload is skipped.
                try:
                    _chunk_json = json.loads(_data_str)
                except json.JSONDecodeError:
                    continue
                yield typing.cast(
                    ChatCompletionChunk,
                    parse_obj_as(
                        type_=ChatCompletionChunk,  # type: ignore
                        object_=_chunk_json,
                    ),
                )


class AsyncRawChatClient:
    """
    Asynchronous raw client for the `v1/chat/completions` endpoint.

    Non-streaming calls return an `AsyncHttpResponse` wrapping the parsed
    response body; streaming calls return an async iterator of
    `ChatCompletionChunk` objects parsed from the server-sent event stream.
    """

    def __init__(self, *, client_wrapper: AsyncClientWrapper):
        """
        Parameters
        ----------
        client_wrapper : AsyncClientWrapper
            Wrapper supplying the HTTP client and the environment (base URL) used for requests.
        """
        self._client_wrapper = client_wrapper

    @typing.overload
    async def completions(
        self,
        *,
        messages: typing.Sequence[ChatCompletionRequestMessageParams],
        model: SarvamModelIds,
        temperature: typing.Optional[float] = ...,
        top_p: typing.Optional[float] = ...,
        reasoning_effort: typing.Optional[ReasoningEffort] = ...,
        max_tokens: typing.Optional[int] = ...,
        stream: typing.Literal[True],
        stop: typing.Optional[StopConfigurationParams] = ...,
        n: typing.Optional[int] = ...,
        seed: typing.Optional[int] = ...,
        frequency_penalty: typing.Optional[float] = ...,
        presence_penalty: typing.Optional[float] = ...,
        wiki_grounding: typing.Optional[bool] = ...,
        tools: typing.Optional[typing.Sequence[ChatCompletionToolParams]] = ...,
        tool_choice: typing.Optional[ToolChoiceOptionParams] = ...,
        request_options: typing.Optional[RequestOptions] = ...,
    ) -> typing.AsyncIterator[ChatCompletionChunk]: ...

    @typing.overload
    async def completions(
        self,
        *,
        messages: typing.Sequence[ChatCompletionRequestMessageParams],
        model: SarvamModelIds,
        temperature: typing.Optional[float] = ...,
        top_p: typing.Optional[float] = ...,
        reasoning_effort: typing.Optional[ReasoningEffort] = ...,
        max_tokens: typing.Optional[int] = ...,
        stream: typing.Optional[typing.Literal[False]] = ...,
        stop: typing.Optional[StopConfigurationParams] = ...,
        n: typing.Optional[int] = ...,
        seed: typing.Optional[int] = ...,
        frequency_penalty: typing.Optional[float] = ...,
        presence_penalty: typing.Optional[float] = ...,
        wiki_grounding: typing.Optional[bool] = ...,
        tools: typing.Optional[typing.Sequence[ChatCompletionToolParams]] = ...,
        tool_choice: typing.Optional[ToolChoiceOptionParams] = ...,
        request_options: typing.Optional[RequestOptions] = ...,
    ) -> AsyncHttpResponse[CreateChatCompletionResponse]: ...

    async def completions(
        self,
        *,
        messages: typing.Sequence[ChatCompletionRequestMessageParams],
        model: SarvamModelIds,
        temperature: typing.Optional[float] = OMIT,
        top_p: typing.Optional[float] = OMIT,
        reasoning_effort: typing.Optional[ReasoningEffort] = OMIT,
        max_tokens: typing.Optional[int] = OMIT,
        stream: typing.Optional[bool] = OMIT,
        stop: typing.Optional[StopConfigurationParams] = OMIT,
        n: typing.Optional[int] = OMIT,
        seed: typing.Optional[int] = OMIT,
        frequency_penalty: typing.Optional[float] = OMIT,
        presence_penalty: typing.Optional[float] = OMIT,
        wiki_grounding: typing.Optional[bool] = OMIT,
        tools: typing.Optional[typing.Sequence[ChatCompletionToolParams]] = OMIT,
        tool_choice: typing.Optional[ToolChoiceOptionParams] = OMIT,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> typing.Union[AsyncHttpResponse[CreateChatCompletionResponse], typing.AsyncIterator[ChatCompletionChunk]]:
        """
        Parameters
        ----------
        messages : typing.Sequence[ChatCompletionRequestMessageParams]
            A list of messages comprising the conversation so far.

        model : SarvamModelIds
            Model ID used to generate the response, like `sarvam-m`.

        temperature : typing.Optional[float]
            What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
            We generally recommend altering this or `top_p` but not both.

        top_p : typing.Optional[float]
            An alternative to sampling with temperature, called nucleus sampling,
            where the model considers the results of the tokens with top_p probability
            mass. So 0.1 means only the tokens comprising the top 10% probability mass
            are considered.

            We generally recommend altering this or `temperature` but not both.

        reasoning_effort : typing.Optional[ReasoningEffort]
            The effort to use for reasoning

        max_tokens : typing.Optional[int]
            The maximum number of tokens that can be generated in the chat completion.

        stream : typing.Optional[bool]
            If set to true, the model response data will be streamed to the client
            as it is generated using [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
            When true, returns an AsyncIterator[ChatCompletionChunk] instead of AsyncHttpResponse.

        stop : typing.Optional[StopConfigurationParams]

        n : typing.Optional[int]
            How many chat completion choices to generate for each input message. Note that you will be charged based on the number of generated tokens across all of the choices. Keep `n` as `1` to minimize costs.

        seed : typing.Optional[int]
            This feature is in Beta.
            If specified, our system will make a best effort to sample deterministically, such that repeated requests with the same `seed` and parameters should return the same result.
            Determinism is not guaranteed, and you should refer to the `system_fingerprint` response parameter to monitor changes in the backend.

        frequency_penalty : typing.Optional[float]
            Number between -2.0 and 2.0. Positive values penalize new tokens based on
            their existing frequency in the text so far, decreasing the model's
            likelihood to repeat the same line verbatim.

        presence_penalty : typing.Optional[float]
            Number between -2.0 and 2.0. Positive values penalize new tokens based on
            whether they appear in the text so far, increasing the model's likelihood
            to talk about new topics.

        wiki_grounding : typing.Optional[bool]
            If set to true, the model response will be wiki grounded.

        tools : typing.Optional[typing.Sequence[ChatCompletionToolParams]]
            A list of tools the model may call. Currently, only functions are supported as a tool.

        tool_choice : typing.Optional[ToolChoiceOptionParams]
            Controls which (if any) tool is called by the model.

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration.

        Returns
        -------
        AsyncHttpResponse[CreateChatCompletionResponse] or AsyncIterator[ChatCompletionChunk]
            When stream=False (default): AsyncHttpResponse wrapping CreateChatCompletionResponse.
            When stream=True: AsyncIterator yielding ChatCompletionChunk objects.

        Raises
        ------
        BadRequestError, ForbiddenError, UnprocessableEntityError, TooManyRequestsError, InternalServerError
            For 400, 403, 422, 429 and 500 responses respectively.

        ApiError
            For any other non-2xx response, or when the response body cannot be decoded as JSON.

        Notes
        -----
        With stream=True the awaited result is an async generator, so the request
        is only issued (and the errors above only raised) once iteration starts.
        """
        if stream is True:
            return self._completions_stream(
                messages=messages,
                model=model,
                temperature=temperature,
                top_p=top_p,
                reasoning_effort=reasoning_effort,
                max_tokens=max_tokens,
                stop=stop,
                n=n,
                seed=seed,
                frequency_penalty=frequency_penalty,
                presence_penalty=presence_penalty,
                wiki_grounding=wiki_grounding,
                tools=tools,
                tool_choice=tool_choice,
                request_options=request_options,
            )

        _response = await self._client_wrapper.httpx_client.request(
            "v1/chat/completions",
            base_url=self._client_wrapper.get_environment().base,
            method="POST",
            json={
                "messages": convert_and_respect_annotation_metadata(
                    object_=messages, annotation=typing.Sequence[ChatCompletionRequestMessageParams], direction="write"
                ),
                "model": model,
                "temperature": temperature,
                "top_p": top_p,
                "reasoning_effort": reasoning_effort,
                "max_tokens": max_tokens,
                "stream": stream,
                "stop": convert_and_respect_annotation_metadata(
                    object_=stop, annotation=StopConfigurationParams, direction="write"
                ),
                "n": n,
                "seed": seed,
                "frequency_penalty": frequency_penalty,
                "presence_penalty": presence_penalty,
                "wiki_grounding": wiki_grounding,
                "tools": convert_and_respect_annotation_metadata(
                    object_=tools, annotation=typing.Sequence[ChatCompletionToolParams], direction="write"
                ),
                "tool_choice": convert_and_respect_annotation_metadata(
                    object_=tool_choice, annotation=ToolChoiceOptionParams, direction="write"
                ),
            },
            headers={
                "content-type": "application/json",
            },
            request_options=request_options,
            omit=OMIT,
        )
        try:
            if 200 <= _response.status_code < 300:
                _data = typing.cast(
                    CreateChatCompletionResponse,
                    parse_obj_as(
                        type_=CreateChatCompletionResponse,  # type: ignore
                        object_=_response.json(),
                    ),
                )
                return AsyncHttpResponse(response=_response, data=_data)
            _response_json = _response.json()
        except JSONDecodeError:
            # The body is not JSON: report the raw text rather than a decode failure.
            raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response.text)
        raise self._error_for_status(_response, _response_json)

    @staticmethod
    def _error_for_status(response: typing.Any, body: typing.Any) -> Exception:
        """
        Build the exception for a non-2xx response whose JSON body is already decoded.

        Parameters
        ----------
        response : typing.Any
            The HTTP response; only `status_code` and `headers` are read.

        body : typing.Any
            The decoded JSON body of the response.

        Returns
        -------
        Exception
            The dedicated error for 400, 403, 422, 429 or 500; otherwise an ApiError
            carrying the status code. The caller is expected to raise it.
        """
        _typed_errors: typing.Dict[int, typing.Any] = {
            400: BadRequestError,
            403: ForbiddenError,
            422: UnprocessableEntityError,
            429: TooManyRequestsError,
            500: InternalServerError,
        }
        _headers = dict(response.headers)
        _error_type = _typed_errors.get(response.status_code)
        if _error_type is None:
            return ApiError(status_code=response.status_code, headers=_headers, body=body)
        return _error_type(
            headers=_headers,
            body=typing.cast(
                typing.Optional[typing.Any],
                parse_obj_as(
                    type_=typing.Optional[typing.Any],  # type: ignore
                    object_=body,
                ),
            ),
        )

    async def _completions_stream(
        self,
        *,
        messages: typing.Sequence[ChatCompletionRequestMessageParams],
        model: SarvamModelIds,
        temperature: typing.Optional[float] = OMIT,
        top_p: typing.Optional[float] = OMIT,
        reasoning_effort: typing.Optional[ReasoningEffort] = OMIT,
        max_tokens: typing.Optional[int] = OMIT,
        stop: typing.Optional[StopConfigurationParams] = OMIT,
        n: typing.Optional[int] = OMIT,
        seed: typing.Optional[int] = OMIT,
        frequency_penalty: typing.Optional[float] = OMIT,
        presence_penalty: typing.Optional[float] = OMIT,
        wiki_grounding: typing.Optional[bool] = OMIT,
        tools: typing.Optional[typing.Sequence[ChatCompletionToolParams]] = OMIT,
        tool_choice: typing.Optional[ToolChoiceOptionParams] = OMIT,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> typing.AsyncIterator[ChatCompletionChunk]:
        """
        Stream a chat completion, yielding one ChatCompletionChunk per `data:` event.

        Takes the same parameters as `completions` except `stream`, which is always
        sent as true. Iteration ends when the `[DONE]` event arrives or the response
        is exhausted; events whose payload is not valid JSON are skipped.

        Raises
        ------
        BadRequestError, ForbiddenError, UnprocessableEntityError, TooManyRequestsError, InternalServerError
            For 400, 403, 422, 429 and 500 responses with a JSON body, matching the
            non-streaming path.

        ApiError
            For any other non-2xx response, or when the error body is not JSON.
        """
        async with self._client_wrapper.httpx_client.stream(
            "v1/chat/completions",
            base_url=self._client_wrapper.get_environment().base,
            method="POST",
            json={
                "messages": convert_and_respect_annotation_metadata(
                    object_=messages, annotation=typing.Sequence[ChatCompletionRequestMessageParams], direction="write"
                ),
                "model": model,
                "temperature": temperature,
                "top_p": top_p,
                "reasoning_effort": reasoning_effort,
                "max_tokens": max_tokens,
                "stream": True,
                "stop": convert_and_respect_annotation_metadata(
                    object_=stop, annotation=StopConfigurationParams, direction="write"
                ),
                "n": n,
                "seed": seed,
                "frequency_penalty": frequency_penalty,
                "presence_penalty": presence_penalty,
                "wiki_grounding": wiki_grounding,
                "tools": convert_and_respect_annotation_metadata(
                    object_=tools, annotation=typing.Sequence[ChatCompletionToolParams], direction="write"
                ),
                "tool_choice": convert_and_respect_annotation_metadata(
                    object_=tool_choice, annotation=ToolChoiceOptionParams, direction="write"
                ),
            },
            headers={
                "content-type": "application/json",
            },
            request_options=request_options,
            omit=OMIT,
        ) as _response:
            if not (200 <= _response.status_code < 300):
                # The body of a streamed response must be read before it can be decoded.
                await _response.aread()
                try:
                    _body = _response.json()
                except Exception:
                    # Best effort: any failure to decode the error body falls back to raw text.
                    _error: Exception = ApiError(
                        status_code=_response.status_code, headers=dict(_response.headers), body=_response.text
                    )
                else:
                    # NOTE(review): assumes the typed errors subclass ApiError (Fern convention),
                    # so existing `except ApiError` handlers keep working - confirm.
                    _error = self._error_for_status(_response, _body)
                raise _error

            async for _line in _response.aiter_lines():
                # Only `data` fields carry chunks; blank lines and other SSE fields are ignored.
                if not _line or not _line.startswith("data:"):
                    continue
                _data_str = _line[len("data:"):]
                # The SSE format allows one optional space after the colon.
                if _data_str.startswith(" "):
                    _data_str = _data_str[1:]
                if _data_str.strip() == "[DONE]":
                    return
                # Keep the try minimal so only a malformed payload is skipped.
                try:
                    _chunk_json = json.loads(_data_str)
                except json.JSONDecodeError:
                    continue
                yield typing.cast(
                    ChatCompletionChunk,
                    parse_obj_as(
                        type_=ChatCompletionChunk,  # type: ignore
                        object_=_chunk_json,
                    ),
                )
