"""HuggingFace Chat Client adapter for Microsoft Agent Framework. This client enables the use of HuggingFace Inference API (including the free tier) as a backend for the agent framework, allowing "Advanced Mode" to work without an OpenAI API key. """ import asyncio from collections.abc import AsyncIterable, MutableSequence from functools import partial from typing import Any, cast import structlog from agent_framework import ( BaseChatClient, ChatMessage, ChatOptions, ChatResponse, ChatResponseUpdate, ) from huggingface_hub import InferenceClient from src.utils.config import settings logger = structlog.get_logger() class HuggingFaceChatClient(BaseChatClient): # type: ignore[misc] """Adapter for HuggingFace Inference API.""" def __init__( self, model_id: str | None = None, api_key: str | None = None, **kwargs: Any, ) -> None: """Initialize the HuggingFace chat client. Args: model_id: The HuggingFace model ID (default: configured value or Llama-3.1-70B). api_key: HF_TOKEN (optional, defaults to env var). **kwargs: Additional arguments passed to BaseChatClient. """ super().__init__(**kwargs) self.model_id = ( model_id or settings.huggingface_model or "meta-llama/Llama-3.1-70B-Instruct" ) self.api_key = api_key or settings.hf_token # Initialize the HF Inference Client # timeout=60 to prevent premature timeouts on long reasonings self._client = InferenceClient( model=self.model_id, token=self.api_key, timeout=60, ) logger.info("Initialized HuggingFaceChatClient", model=self.model_id) def _convert_messages(self, messages: MutableSequence[ChatMessage]) -> list[dict[str, Any]]: """Convert framework messages to HuggingFace format.""" hf_messages: list[dict[str, Any]] = [] for msg in messages: # Basic conversion - extend as needed for multi-modal content = msg.text or "" # msg.role can be string or enum - extract .value for enums # str(Role.USER) -> "Role.USER" (wrong), Role.USER.value -> "user" (correct) if hasattr(msg.role, "value"): role_str = str(msg.role.value) else: role_str = str(msg.role) hf_messages.append({"role": role_str, "content": content}) return hf_messages async def _inner_get_response( self, *, messages: MutableSequence[ChatMessage], chat_options: ChatOptions, **kwargs: Any, ) -> ChatResponse: """Synchronous response generation using chat_completion.""" hf_messages = self._convert_messages(messages) # Extract tool configuration tools = chat_options.tools if chat_options.tools else None # HF expects 'tool_choice' to be 'auto', 'none', or specific tool # Framework uses ToolMode enum or dict hf_tool_choice: str | None = None if chat_options.tool_choice is not None: tool_choice_str = str(chat_options.tool_choice) if "AUTO" in tool_choice_str: hf_tool_choice = "auto" # For NONE or other, leave as None try: # Use explicit None checks - 'or' treats 0/0.0 as falsy # temperature=0.0 is valid (deterministic output) max_tokens = chat_options.max_tokens if chat_options.max_tokens is not None else 2048 temperature = chat_options.temperature if chat_options.temperature is not None else 0.7 # Use partial to create a callable with keyword args for to_thread call_fn = partial( self._client.chat_completion, messages=hf_messages, tools=tools, tool_choice=hf_tool_choice, max_tokens=max_tokens, temperature=temperature, stream=False, ) response = await asyncio.to_thread(call_fn) # Parse response # HF returns a ChatCompletionOutput choices = response.choices if not choices: return ChatResponse(messages=[], response_id="error-no-choices") choice = choices[0] message_content = choice.message.content or "" 
            # Construct the response message.
            response_msg = ChatMessage(
                role=cast(Any, choice.message.role),
                text=message_content,
            )
            return ChatResponse(
                messages=[response_msg],
                response_id=response.id or "hf-response",
            )
        except Exception as e:
            logger.error("HuggingFace API error", error=str(e))
            raise

    async def _inner_get_streaming_response(
        self,
        *,
        messages: MutableSequence[ChatMessage],
        chat_options: ChatOptions,
        **kwargs: Any,
    ) -> AsyncIterable[ChatResponseUpdate]:
        """Streaming response generation."""
        hf_messages = self._convert_messages(messages)

        tools = chat_options.tools if chat_options.tools else None
        hf_tool_choice: str | None = None
        if chat_options.tool_choice is not None:
            if "AUTO" in str(chat_options.tool_choice):
                hf_tool_choice = "auto"

        try:
            # Use explicit None checks: 'or' treats 0/0.0 as falsy, but
            # temperature=0.0 is valid (deterministic output).
            max_tokens = chat_options.max_tokens if chat_options.max_tokens is not None else 2048
            temperature = chat_options.temperature if chat_options.temperature is not None else 0.7

            # Use partial for the streaming call.
            call_fn = partial(
                self._client.chat_completion,
                messages=hf_messages,
                tools=tools,
                tool_choice=hf_tool_choice,
                max_tokens=max_tokens,
                temperature=temperature,
                stream=True,
            )
            stream = await asyncio.to_thread(call_fn)

            # The result is a blocking iterator of ChatCompletionStreamOutput
            # chunks. Pull each chunk in a worker thread so that waiting on
            # the network never blocks the event loop.
            stream_iter = iter(stream)
            sentinel: Any = object()
            while True:
                chunk = await asyncio.to_thread(next, stream_iter, sentinel)
                if chunk is sentinel:
                    break
                if not chunk.choices:
                    continue
                choice = chunk.choices[0]
                delta = choice.delta

                # Convert the delta to a ChatResponseUpdate.
                yield ChatResponseUpdate(
                    role=cast(Any, delta.role) if delta.role else None,
                    content=delta.content,
                )
        except Exception as e:
            logger.error("HuggingFace streaming error", error=str(e))
            raise
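

# --- Usage sketch (illustrative only) ---
# A minimal smoke test that drives the adapter's _inner_* hooks directly.
# Assumptions not confirmed by this module: ChatOptions() is constructible
# with keyword defaults (tools, tool_choice, max_tokens, temperature all
# None), ChatResponseUpdate exposes the `content` it was constructed with,
# and HF_TOKEN/settings are configured. In real use, go through the
# framework's public client surface rather than the _inner_* methods.
if __name__ == "__main__":

    async def _demo() -> None:
        client = HuggingFaceChatClient()
        messages = [ChatMessage(role="user", text="Say hello in one sentence.")]

        # Non-streaming: a single ChatResponse.
        response = await client._inner_get_response(
            messages=messages, chat_options=ChatOptions()
        )
        print(response.messages[0].text if response.messages else "<no choices>")

        # Streaming: print deltas as they arrive.
        async for update in client._inner_get_streaming_response(
            messages=messages, chat_options=ChatOptions()
        ):
            print(update.content or "", end="", flush=True)
        print()

    asyncio.run(_demo())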