core/homeassistant/components/cloud/entity.py

"""Helpers for cloud LLM chat handling."""
import base64
from collections.abc import AsyncGenerator, Callable, Iterable
from enum import Enum
import json
import logging
import re
from typing import Any, Literal, cast
from hass_nabucasa import Cloud, NabuCasaBaseError
from hass_nabucasa.llm import (
LLMAuthenticationError,
LLMRateLimitError,
LLMResponseError,
LLMServiceError,
)
from litellm import (
ResponseFunctionToolCall,
ResponseInputParam,
ResponsesAPIStreamEvents,
)
from openai.types.responses import (
FunctionToolParam,
ResponseInputItemParam,
ResponseReasoningItem,
ToolParam,
WebSearchToolParam,
)
from openai.types.responses.response_input_param import (
ImageGenerationCall as ImageGenerationCallParam,
)
from openai.types.responses.response_output_item import ImageGenerationCall
import voluptuous as vol
from voluptuous_openapi import convert
from homeassistant.components import conversation
from homeassistant.config_entries import ConfigEntry
from homeassistant.exceptions import HomeAssistantError
from homeassistant.helpers import llm
from homeassistant.helpers.entity import Entity
from homeassistant.util import slugify
from .client import CloudClient
_LOGGER = logging.getLogger(__name__)
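
# Upper bound on request/tool-call rounds in _async_handle_chat_log, so a model that
# keeps asking for tools cannot loop forever.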
_MAX_TOOL_ITERATIONS = 10
class ResponseItemType(str, Enum):
"""Response item types."""
FUNCTION_CALL = "function_call"
MESSAGE = "message"
REASONING = "reasoning"
WEB_SEARCH_CALL = "web_search_call"
IMAGE = "image"
def _convert_content_to_param(
chat_content: Iterable[conversation.Content],
) -> ResponseInputParam:
"""Convert any native chat message for this agent to the native format."""
messages: ResponseInputParam = []
reasoning_summary: list[str] = []
web_search_calls: dict[str, dict[str, Any]] = {}
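    # Buffered web_search_call items, keyed by tool call ID; each is appended to the
    # messages only once its matching tool result supplies the final status.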
for content in chat_content:
if isinstance(content, conversation.ToolResultContent):
if (
content.tool_name == "web_search_call"
and content.tool_call_id in web_search_calls
):
web_search_call = web_search_calls.pop(content.tool_call_id)
web_search_call["status"] = content.tool_result.get(
"status", "completed"
)
messages.append(cast("ResponseInputItemParam", web_search_call))
else:
messages.append(
{
"type": "function_call_output",
"call_id": content.tool_call_id,
"output": json.dumps(content.tool_result),
}
)
continue
if content.content:
role: Literal["user", "assistant", "system", "developer"] = content.role
if role == "system":
role = "developer"
messages.append(
{"type": "message", "role": role, "content": content.content}
)
if isinstance(content, conversation.AssistantContent):
if content.tool_calls:
for tool_call in content.tool_calls:
if (
tool_call.external
and tool_call.tool_name == "web_search_call"
and "action" in tool_call.tool_args
):
web_search_calls[tool_call.id] = {
"type": "web_search_call",
"id": tool_call.id,
"action": tool_call.tool_args["action"],
"status": "completed",
}
else:
messages.append(
{
"type": "function_call",
"name": tool_call.tool_name,
"arguments": json.dumps(tool_call.tool_args),
"call_id": tool_call.id,
}
)
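            # Thinking deltas are collected until the native reasoning item arrives,
            # then replayed as summary_text entries next to its encrypted content.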
if content.thinking_content:
reasoning_summary.append(content.thinking_content)
if isinstance(content.native, ResponseReasoningItem):
messages.append(
{
"type": "reasoning",
"id": content.native.id,
"summary": (
[
{
"type": "summary_text",
"text": summary,
}
for summary in reasoning_summary
]
if content.thinking_content
else []
),
"encrypted_content": content.native.encrypted_content,
}
)
reasoning_summary = []
elif isinstance(content.native, ImageGenerationCall):
messages.append(
cast(ImageGenerationCallParam, content.native.to_dict())
)
return messages
def _format_tool(
tool: llm.Tool,
custom_serializer: Callable[[Any], Any] | None,
) -> ToolParam:
"""Format a Home Assistant tool for the OpenAI Responses API."""
parameters = convert(tool.parameters, custom_serializer=custom_serializer)
spec: FunctionToolParam = {
"type": "function",
"name": tool.name,
"strict": False,
"description": tool.description,
"parameters": parameters,
}
return spec
def _adjust_schema(schema: dict[str, Any]) -> None:
"""Adjust the schema to be compatible with OpenAI API."""
if schema["type"] == "object":
schema.setdefault("strict", True)
schema.setdefault("additionalProperties", False)
if "properties" not in schema:
return
if "required" not in schema:
schema["required"] = []
# Ensure all properties are required
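        # OpenAI's strict structured-output mode expects every property to appear in
        # "required", so previously optional fields are made nullable instead.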
for prop, prop_info in schema["properties"].items():
_adjust_schema(prop_info)
if prop not in schema["required"]:
prop_info["type"] = [prop_info["type"], "null"]
schema["required"].append(prop)
elif schema["type"] == "array":
if "items" not in schema:
return
_adjust_schema(schema["items"])
def _format_structured_output(
schema: vol.Schema, llm_api: llm.APIInstance | None
) -> dict[str, Any]:
"""Format the schema to be compatible with OpenAI API."""
result: dict[str, Any] = convert(
schema,
custom_serializer=(
llm_api.custom_serializer if llm_api else llm.selector_serializer
),
)
_ensure_schema_constraints(result)
return result
def _ensure_schema_constraints(schema: dict[str, Any]) -> None:
"""Ensure generated schemas match the Responses API expectations."""
schema_type = schema.get("type")
if schema_type == "object":
schema.setdefault("additionalProperties", False)
properties = schema.get("properties")
if isinstance(properties, dict):
for property_schema in properties.values():
if isinstance(property_schema, dict):
_ensure_schema_constraints(property_schema)
elif schema_type == "array":
items = schema.get("items")
if isinstance(items, dict):
_ensure_schema_constraints(items)
# Borrowed and adapted from the openai_conversation component
async def _transform_stream( # noqa: C901 - This is complex, but better to have it in one place
chat_log: conversation.ChatLog,
stream: Any,
remove_citations: bool = False,
) -> AsyncGenerator[
conversation.AssistantContentDeltaDict | conversation.ToolResultContentDeltaDict
]:
"""Transform stream result into HA format."""
last_summary_index = None
last_role: Literal["assistant", "tool_result"] | None = None
current_tool_call: ResponseFunctionToolCall | None = None
    # Non-reasoning models don't honor our request to omit citations, so we strip
    # them manually here. Citations follow a consistent pattern: a Markdown link
    # wrapped in parentheses, delivered in a single delta event, except that the
    # closing parenthesis is sometimes split into the following delta event.
remove_parentheses: bool = False
citation_regexp = re.compile(r"\(\[([^\]]+)\]\((https?:\/\/[^\)]+)\)")
async for event in stream:
event_type = getattr(event, "type", None)
event_item = getattr(event, "item", None)
event_item_type = getattr(event_item, "type", None) if event_item else None
_LOGGER.debug(
"Event[%s] | item: %s",
event_type,
event_item_type,
)
if event_type == ResponsesAPIStreamEvents.OUTPUT_ITEM_ADDED:
# Detect function_call even when it's a BaseLiteLLMOpenAIResponseObject
if event_item_type == ResponseItemType.FUNCTION_CALL:
# OpenAI has tool calls as individual events
# while HA puts tool calls inside the assistant message.
# We turn them into individual assistant content for HA
# to ensure that tools are called as soon as possible.
yield {"role": "assistant"}
last_role = "assistant"
last_summary_index = None
current_tool_call = cast(ResponseFunctionToolCall, event.item)
elif (
event_item_type == ResponseItemType.MESSAGE
or (
event_item_type == ResponseItemType.REASONING
and last_summary_index is not None
) # Subsequent ResponseReasoningItem
or last_role != "assistant"
):
yield {"role": "assistant"}
last_role = "assistant"
last_summary_index = None
elif event_type == ResponsesAPIStreamEvents.OUTPUT_ITEM_DONE:
if event_item_type == ResponseItemType.REASONING:
encrypted_content = getattr(event.item, "encrypted_content", None)
summary = getattr(event.item, "summary", []) or []
yield {
"native": ResponseReasoningItem(
type="reasoning",
id=event.item.id,
summary=[],
encrypted_content=encrypted_content,
)
}
last_summary_index = len(summary) - 1 if summary else None
elif event_item_type == ResponseItemType.WEB_SEARCH_CALL:
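                # The search ran server-side: report it as an external tool call and
                # immediately record its result so both halves land in the chat log.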
action = getattr(event.item, "action", None)
if isinstance(action, dict):
action_dict = action
elif action is not None:
action_dict = action.to_dict()
else:
action_dict = {}
yield {
"tool_calls": [
llm.ToolInput(
id=event.item.id,
tool_name="web_search_call",
tool_args={"action": action_dict},
external=True,
)
]
}
yield {
"role": "tool_result",
"tool_call_id": event.item.id,
"tool_name": "web_search_call",
"tool_result": {"status": event.item.status},
}
last_role = "tool_result"
elif event_item_type == ResponseItemType.IMAGE:
yield {"native": event.item}
last_summary_index = -1 # Trigger new assistant message on next turn
elif event_type == ResponsesAPIStreamEvents.OUTPUT_TEXT_DELTA:
data = event.delta
if remove_parentheses:
data = data.removeprefix(")")
remove_parentheses = False
elif remove_citations and (match := citation_regexp.search(data)):
match_start, match_end = match.span()
# remove leading space if any
if data[match_start - 1 : match_start] == " ":
match_start -= 1
# remove closing parenthesis:
if data[match_end : match_end + 1] == ")":
match_end += 1
else:
remove_parentheses = True
data = data[:match_start] + data[match_end:]
if data:
yield {"content": data}
elif event_type == ResponsesAPIStreamEvents.REASONING_SUMMARY_TEXT_DELTA:
# OpenAI can output several reasoning summaries
# in a single ResponseReasoningItem. We split them as separate
            # AssistantContent messages. Only the last of them will have
# the reasoning `native` field set.
if (
last_summary_index is not None
and event.summary_index != last_summary_index
):
yield {"role": "assistant"}
last_role = "assistant"
last_summary_index = event.summary_index
yield {"thinking_content": event.delta}
elif event_type == ResponsesAPIStreamEvents.FUNCTION_CALL_ARGUMENTS_DELTA:
if current_tool_call is not None:
current_tool_call.arguments += event.delta
elif event_type == ResponsesAPIStreamEvents.WEB_SEARCH_CALL_SEARCHING:
yield {"role": "assistant"}
elif event_type == ResponsesAPIStreamEvents.FUNCTION_CALL_ARGUMENTS_DONE:
if current_tool_call is not None:
current_tool_call.status = "completed"
raw_args = json.loads(current_tool_call.arguments)
for key in ("area", "floor"):
if key in raw_args and not raw_args[key]:
# Remove keys that are "" or None
raw_args.pop(key, None)
yield {
"tool_calls": [
llm.ToolInput(
id=current_tool_call.call_id,
tool_name=current_tool_call.name,
tool_args=raw_args,
)
]
}
elif event_type == ResponsesAPIStreamEvents.RESPONSE_COMPLETED:
if event.response.usage is not None:
chat_log.async_trace(
{
"stats": {
"input_tokens": event.response.usage.input_tokens,
"output_tokens": event.response.usage.output_tokens,
}
}
)
elif event_type == ResponsesAPIStreamEvents.RESPONSE_INCOMPLETE:
if event.response.usage is not None:
chat_log.async_trace(
{
"stats": {
"input_tokens": event.response.usage.input_tokens,
"output_tokens": event.response.usage.output_tokens,
}
}
)
if (
event.response.incomplete_details
and event.response.incomplete_details.reason
):
reason: str = event.response.incomplete_details.reason
else:
reason = "unknown reason"
if reason == "max_output_tokens":
reason = "max output tokens reached"
elif reason == "content_filter":
reason = "content filter triggered"
raise HomeAssistantError(f"OpenAI response incomplete: {reason}")
elif event_type == ResponsesAPIStreamEvents.RESPONSE_FAILED:
if event.response.usage is not None:
chat_log.async_trace(
{
"stats": {
"input_tokens": event.response.usage.input_tokens,
"output_tokens": event.response.usage.output_tokens,
}
}
)
reason = "unknown reason"
if event.response.error is not None:
reason = event.response.error.message
raise HomeAssistantError(f"OpenAI response failed: {reason}")
elif event_type == ResponsesAPIStreamEvents.ERROR:
raise HomeAssistantError(f"OpenAI response error: {event.message}")
class BaseCloudLLMEntity(Entity):
"""Cloud LLM conversation agent."""
def __init__(self, cloud: Cloud[CloudClient], config_entry: ConfigEntry) -> None:
"""Initialize the entity."""
self._cloud = cloud
self._entry = config_entry
async def _prepare_chat_for_generation(
self,
chat_log: conversation.ChatLog,
messages: ResponseInputParam,
response_format: dict[str, Any] | None = None,
) -> dict[str, Any]:
"""Prepare kwargs for Cloud LLM from the chat log."""
        last_content: Any = chat_log.content[-1]
        if last_content.role == "user" and last_content.attachments:
            files = await self._async_prepare_files_for_prompt(last_content.attachments)
            # Attach the encoded files to the last user message in the request payload.
            last_message: Any = messages[-1]
            last_message["content"] = [
                {"type": "input_text", "text": last_content.content or ""},
                *files,
            ]
tools: list[ToolParam] = []
tool_choice: str | None = None
if chat_log.llm_api:
ha_tools: list[ToolParam] = [
_format_tool(tool, chat_log.llm_api.custom_serializer)
for tool in chat_log.llm_api.tools
]
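            # While tool results are still outstanding, withhold the Home Assistant
            # tools and force tool_choice="none" so the model replies with text
            # instead of queueing more calls.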
if ha_tools:
if not chat_log.unresponded_tool_results:
tools = ha_tools
tool_choice = "auto"
else:
tools = []
tool_choice = "none"
web_search = WebSearchToolParam(
type="web_search",
search_context_size="medium",
)
tools.append(web_search)
response_kwargs: dict[str, Any] = {
"messages": messages,
"conversation_id": chat_log.conversation_id,
}
if response_format is not None:
response_kwargs["response_format"] = response_format
        if tools:
response_kwargs["tools"] = tools
if tool_choice is not None:
response_kwargs["tool_choice"] = tool_choice
response_kwargs["stream"] = True
return response_kwargs
async def _async_prepare_files_for_prompt(
self,
attachments: list[conversation.Attachment],
) -> list[dict[str, Any]]:
"""Prepare files for multimodal prompts."""
def prepare() -> list[dict[str, Any]]:
content: list[dict[str, Any]] = []
for attachment in attachments:
mime_type = attachment.mime_type
path = attachment.path
if not path.exists():
raise HomeAssistantError(f"`{path}` does not exist")
data = base64.b64encode(path.read_bytes()).decode("utf-8")
if mime_type and mime_type.startswith("image/"):
content.append(
{
"type": "input_image",
"image_url": f"data:{mime_type};base64,{data}",
"detail": "auto",
}
)
elif mime_type and mime_type.startswith("application/pdf"):
content.append(
{
"type": "input_file",
"filename": str(path.name),
"file_data": f"data:{mime_type};base64,{data}",
}
)
else:
raise HomeAssistantError(
"Only images and PDF are currently supported as attachments"
)
return content
return await self.hass.async_add_executor_job(prepare)
async def _async_handle_chat_log(
self,
type: Literal["ai_task", "conversation"],
chat_log: conversation.ChatLog,
structure_name: str | None = None,
structure: vol.Schema | None = None,
) -> None:
"""Generate a response for the chat log."""
for _ in range(_MAX_TOOL_ITERATIONS):
response_format: dict[str, Any] | None = None
if structure and structure_name:
response_format = {
"type": "json_schema",
"json_schema": {
"name": slugify(structure_name),
"schema": _format_structured_output(
structure, chat_log.llm_api
),
"strict": False,
},
}
messages = _convert_content_to_param(chat_log.content)
response_kwargs = await self._prepare_chat_for_generation(
chat_log,
messages,
response_format,
)
try:
if type == "conversation":
raw_stream = await self._cloud.llm.async_process_conversation(
**response_kwargs,
)
else:
raw_stream = await self._cloud.llm.async_generate_data(
**response_kwargs,
)
messages.extend(
_convert_content_to_param(
[
content
async for content in chat_log.async_add_delta_content_stream(
self.entity_id,
_transform_stream(
chat_log,
raw_stream,
True,
),
)
]
)
)
except LLMAuthenticationError as err:
raise HomeAssistantError("Cloud LLM authentication failed") from err
except LLMRateLimitError as err:
raise HomeAssistantError("Cloud LLM is rate limited") from err
except LLMResponseError as err:
raise HomeAssistantError(str(err)) from err
except LLMServiceError as err:
raise HomeAssistantError("Error talking to Cloud LLM") from err
except NabuCasaBaseError as err:
raise HomeAssistantError(str(err)) from err
if not chat_log.unresponded_tool_results:
break