diff --git a/docs/tutorial/en/index.rst b/docs/tutorial/en/index.rst index 1685353365..e5ecb2684a 100644 --- a/docs/tutorial/en/index.rst +++ b/docs/tutorial/en/index.rst @@ -33,26 +33,41 @@ Welcome to AgentScope's documentation! .. toctree:: :maxdepth: 1 - :caption: Task Guides + :caption: Model and Context tutorial/task_model tutorial/task_prompt - tutorial/task_tool + tutorial/task_token tutorial/task_memory tutorial/task_long_term_memory + +.. toctree:: + :maxdepth: 1 + :caption: Tool + + tutorial/task_tool + tutorial/task_mcp + tutorial/task_agent_skill + +.. toctree:: + :maxdepth: 1 + :caption: Agent + tutorial/task_agent + tutorial/task_state + tutorial/task_hook + +.. toctree:: + :maxdepth: 1 + :caption: Features + tutorial/task_pipeline tutorial/task_plan tutorial/task_rag - tutorial/task_state - tutorial/task_hook - tutorial/task_mcp - tutorial/task_agent_skill tutorial/task_studio tutorial/task_tracing tutorial/task_eval tutorial/task_embedding - tutorial/task_token tutorial/task_tts .. toctree:: @@ -77,3 +92,4 @@ Welcome to AgentScope's documentation! api/agentscope.tracing api/agentscope.session api/agentscope.exception + api/agentscope.tts diff --git a/docs/tutorial/en/src/task_tts.py b/docs/tutorial/en/src/task_tts.py index 7665f025d9..ced1f3aefa 100644 --- a/docs/tutorial/en/src/task_tts.py +++ b/docs/tutorial/en/src/task_tts.py @@ -5,496 +5,236 @@ TTS ==================== -AgentScope provides a unified TTS (Text-to-Speech) module that supports multiple TTS providers, -enabling agents to convert text responses into audio output. This tutorial demonstrates how to use -TTS models in AgentScope. +AgentScope provides a unified interface for Text-to-Speech (TTS) models across multiple API providers. +This tutorial demonstrates how to use TTS models in AgentScope. -The supported TTS providers include: +AgentScope supports the following TTS APIs: -.. list-table:: +.. 
list-table:: Built-in TTS Models :header-rows: 1 - * - Provider + * - API - Class - Streaming Input - * - DashScope Realtime + - Non-Streaming Input + - Streaming Output + - Non-Streaming Output + * - DashScope Realtime API - ``DashScopeRealtimeTTSModel`` - ✅ - * - DashScope + - ✅ + - ✅ + - ✅ + * - DashScope API - ``DashScopeTTSModel`` - ❌ - * - OpenAI + - ✅ + - ✅ + - ✅ + * - OpenAI API - ``OpenAITTSModel`` - ❌ - * - Gemini + - ✅ + - ✅ + - ✅ + * - Gemini API - ``GeminiTTSModel`` - ❌ + - ✅ + - ✅ + - ✅ -All TTS models inherit from ``TTSModelBase`` and provide a unified interface: - -- For **realtime TTS models** (supporting streaming input): - - - ``connect()``: Establish connection to the TTS service - - - ``push(msg)``: Append text chunks incrementally (non-blocking) - - - ``synthesize(msg=None)``: Synthesize speech and block until complete - - - ``close()``: Close the connection and clean up resources +.. note:: The streaming input and output in AgentScope TTS models are all accumulative. -- For **non-realtime TTS models**: +**Choosing the Right Model:** - - ``synthesize(msg)``: Synthesize speech from complete text +- **Use Non-Realtime TTS** when you have complete text ready (e.g., pre-written + responses, complete LLM outputs) +- **Use Realtime TTS** when text is generated progressively (e.g., streaming + LLM responses) for lower latency -The TTS models return ``TTSResponse`` objects containing ``AudioBlock`` instances with base64-encoded audio data. """ -# %% -# Basic Usage - Realtime TTS Models -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# For realtime TTS models (like ``DashScopeRealtimeTTSModel``), you need to: -# -# 1. Initialize the TTS model with appropriate parameters -# 2. Connect to the TTS service using ``connect()`` -# 3. Use ``synthesize()`` to synthesize complete text, or ``push()`` for incremental text -# 4. 
Close the connection using ``close()`` -# -# Let's start with a simple example using DashScope Realtime TTS: - import asyncio import os -from typing import AsyncGenerator +from agentscope.agent import ReActAgent, UserAgent +from agentscope.formatter import DashScopeChatFormatter from agentscope.message import Msg +from agentscope.model import DashScopeChatModel from agentscope.tts import ( DashScopeRealtimeTTSModel, DashScopeTTSModel, - OpenAITTSModel, - GeminiTTSModel, - TTSResponse, ) - -async def example_basic_realtime_tts() -> None: - """A basic example of using DashScope Realtime TTS.""" - # Initialize the TTS model - tts_model = DashScopeRealtimeTTSModel( - api_key=os.environ.get("DASHSCOPE_API_KEY", ""), - model_name="qwen3-tts-flash-realtime", - voice="Cherry", - stream=False, # Set to False for simpler example - ) - - # Connect to the TTS service - await tts_model.connect() - - # Create a message with text content - msg = Msg( - name="assistant", - content="Hello, this is a test of TTS functionality.", - role="assistant", - ) - - # Synthesize the text (blocking until complete) - tts_response = await tts_model.synthesize(msg) - - # The response contains audio blocks - print(f"TTS Response: {tts_response}") - print(f"Number of audio blocks: {len(tts_response.content)}") - - # Clean up - await tts_model.close() - - -# asyncio.run(example_basic_realtime_tts()) - # %% -# Basic Usage - Non-Realtime TTS Models +# Non-Realtime TTS # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# For non-realtime TTS models (like ``DashScopeTTSModel``, ``OpenAITTSModel``, ``GeminiTTSModel``), -# you can directly call ``synthesize()`` without needing to connect first: +# Non-realtime TTS models process complete text inputs and are the simplest +# to use. You can directly call their ``synthesize()`` method. 
+# +# Taking DashScope TTS model as an example: -async def example_basic_non_realtime_tts() -> None: +async def example_non_realtime_tts() -> None: """A basic example of using non-realtime TTS models.""" # Example with DashScope TTS - if os.environ.get("DASHSCOPE_API_KEY"): - tts_model = DashScopeTTSModel( - api_key=os.environ.get("DASHSCOPE_API_KEY", ""), - model_name="qwen3-tts-flash", - voice="Cherry", - ) - - msg = Msg( - name="assistant", - content="Hello, this is DashScope TTS.", - role="assistant", - ) - - # Directly synthesize without connecting - tts_response = await tts_model.synthesize(msg) - - print(f"TTS Response: {tts_response}") - print(f"Audio blocks: {len(tts_response.content)}") - - # Example with OpenAI TTS - if os.environ.get("OPENAI_API_KEY"): - tts_model = OpenAITTSModel( - api_key=os.environ.get("OPENAI_API_KEY", ""), - model_name="gpt-4o-mini-tts", - voice="alloy", - ) - - msg = Msg( - name="assistant", - content="Hello, this is OpenAI TTS.", - role="assistant", - ) - - tts_response = await tts_model.synthesize(msg) - - print(f"TTS Response: {tts_response}") - print(f"Audio blocks: {len(tts_response.content)}") - - # Example with Gemini TTS - if os.environ.get("GEMINI_API_KEY"): - tts_model = GeminiTTSModel( - api_key=os.environ.get("GEMINI_API_KEY", ""), - model_name="gemini-2.5-flash-preview-tts", - voice="Kore", - ) - - msg = Msg( - name="assistant", - content="Hello, this is Gemini TTS.", - role="assistant", - ) - - tts_response = await tts_model.synthesize(msg) - - print(f"TTS Response: {tts_response}") - print(f"Audio blocks: {len(tts_response.content)}") - - -# asyncio.run(example_basic_non_realtime_tts()) - -# %% -# Using TTS with Agents -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# The most common use case is integrating TTS with agents. AgentScope's ``ReActAgent`` -# supports TTS models through the ``tts_model`` parameter. When a TTS model is provided, -# the agent will automatically synthesize its text responses into audio. -# -# .. 
note:: The TTS model will be called automatically during agent execution, handling -# streaming text incrementally for models that support streaming input. - - -async def example_agent_with_tts() -> None: - """An example of using TTS with ReActAgent.""" - from agentscope.agent import ReActAgent, UserAgent - from agentscope.formatter import DashScopeChatFormatter - from agentscope.memory import InMemoryMemory - from agentscope.model import DashScopeChatModel - - # Create a TTS model - tts_model = DashScopeRealtimeTTSModel( - api_key=os.environ.get("DASHSCOPE_API_KEY", ""), - model_name="qwen3-tts-flash-realtime", - voice="Cherry", - ) - - # Create an agent with TTS enabled - agent = ReActAgent( - name="Assistant", - sys_prompt="You are a helpful assistant.", - model=DashScopeChatModel( - api_key=os.environ.get("DASHSCOPE_API_KEY", ""), - model_name="qwen-max", - stream=True, - ), - formatter=DashScopeChatFormatter(), - memory=InMemoryMemory(), - tts_model=tts_model, # Enable TTS - ) - - user = UserAgent("User") - - # The agent will automatically synthesize its responses - msg = await user("Tell me a short story.") - response = await agent(msg) - - print(f"Agent response: {response.get_text_content()}") - - # Clean up - await tts_model.close() - - -# asyncio.run(example_agent_with_tts()) - -# %% -# Streaming Input with Push and Synthesize -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# For realtime TTS models that support streaming input (like ``DashScopeRealtimeTTSModel``), -# you can use ``push()`` to incrementally send text chunks as they arrive, and then -# call ``synthesize()`` to get the final audio output. -# -# - ``push(msg)``: Non-blocking method that appends text and returns any available audio -# - ``synthesize(msg=None)``: Blocking method that waits for all audio to be synthesized -# -# .. note:: The ``push()`` method uses the message ID (``msg.id``) to track streaming -# input requests. All chunks for the same message must have the same ID. 
- - -async def example_streaming_push_synthesize() -> None: - """An example of using push() and synthesize() for streaming input.""" - tts_model = DashScopeRealtimeTTSModel( + tts_model = DashScopeTTSModel( api_key=os.environ.get("DASHSCOPE_API_KEY", ""), - model_name="qwen3-tts-flash-realtime", + model_name="qwen3-tts-flash", voice="Cherry", - stream=False, # Set to False for simpler example + stream=False, # Non-streaming output ) - await tts_model.connect() - - # Simulate streaming text generation - text_chunks = [ - "Hello, ", - "this is ", - "a streaming ", - "TTS example.", - ] - - # Create a message with a consistent ID for all chunks - msg_id = "streaming_msg_001" - accumulated_text = "" - - for i, chunk in enumerate(text_chunks): - # Accumulate text incrementally - accumulated_text += chunk - - # Create a message with accumulated text and same ID - msg = Msg( - name="assistant", - content=accumulated_text, - role="assistant", - ) - msg.id = msg_id # Important: same ID for all chunks - - # Push the incremental text (non-blocking) - tts_response = await tts_model.push(msg) - if tts_response.content: - print( - f"Chunk {i+1}: Received {len(tts_response.content)} audio blocks", - ) - - # Finalize synthesis to get all remaining audio - final_msg = Msg( + msg = Msg( name="assistant", - content=accumulated_text, + content="Hello, this is DashScope TTS.", role="assistant", ) - final_msg.id = msg_id - final_response = await tts_model.synthesize(final_msg) - # Handle both TTSResponse and AsyncGenerator cases - if isinstance(final_response, AsyncGenerator): - async for chunk in final_response: - if chunk.content: - print( - f"Final synthesis chunk: {len(chunk.content)} audio blocks", - ) - else: - print(f"Final synthesis: {len(final_response.content)} audio blocks") + # Directly synthesize without connecting + tts_response = await tts_model.synthesize(msg) - await tts_model.close() + # tts_response.content contains an audio block with base64-encoded audio data + 
print( + "The length of audio data:", + len(tts_response.content[0]["source"]["data"]), + ) -# asyncio.run(example_streaming_push_synthesize()) +asyncio.run(example_non_realtime_tts()) # %% -# Streaming Output Mode -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# TTS models support streaming output mode, where audio is returned as an async generator -# of ``TTSResponse`` objects. This is useful for real-time audio playback. +# **Streaming Output for Lower Latency:** +# +# When ``stream=True``, the model returns audio chunks progressively, allowing +# you to start playback before synthesis completes. This reduces perceived latency. # -# Set ``stream=True`` when initializing the TTS model to enable streaming output: -async def example_streaming_output() -> None: - """An example of using streaming output mode.""" - tts_model = DashScopeRealtimeTTSModel( +async def example_non_realtime_tts_streaming() -> None: + """An example of using non-realtime TTS models with streaming output.""" + # Example with DashScope TTS with streaming output + tts_model = DashScopeTTSModel( api_key=os.environ.get("DASHSCOPE_API_KEY", ""), - model_name="qwen3-tts-flash-realtime", + model_name="qwen3-tts-flash", voice="Cherry", stream=True, # Enable streaming output ) - await tts_model.connect() - msg = Msg( name="assistant", - content="This is a streaming output example.", + content="Hello, this is DashScope TTS with streaming output.", role="assistant", ) - # Synthesize returns an async generator when stream=True - response_generator = await tts_model.synthesize(msg) - - if isinstance(response_generator, AsyncGenerator): - # Streaming mode - iterate over audio chunks - async for chunk in response_generator: - if chunk.content: - print( - f"Received audio chunk: {len(chunk.content)} blocks, is_last={chunk.is_last}", - ) - # Process audio chunk here (e.g., play audio) - else: - # Non-streaming mode - print(f"Received {len(response_generator.content)} audio blocks") - - await tts_model.close() - - -# 
asyncio.run(example_streaming_output()) - -# %% -# Context Manager Usage -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# TTS models support Python's async context manager protocol, which automatically handles -# connection and cleanup. This is especially useful for realtime TTS models: - - -async def example_context_manager() -> None: - """An example of using TTS models as context managers.""" - # For realtime TTS models, the context manager automatically calls connect() and close() - async with DashScopeRealtimeTTSModel( - api_key=os.environ.get("DASHSCOPE_API_KEY", ""), - model_name="qwen3-tts-flash-realtime", - voice="Cherry", - stream=False, # Set to False for simpler example - ) as tts_model: - msg = Msg( - name="assistant", - content="Using context manager for TTS.", - role="assistant", + # Synthesize and receive an async generator for streaming output + async for tts_response in await tts_model.synthesize(msg): + # Process each audio chunk as it arrives + print( + "Received audio chunk of length:", + len(tts_response.content[0]["source"]["data"]), ) - tts_response = await tts_model.synthesize(msg) - print(f"TTS Response: {tts_response}") - # Connection is automatically closed when exiting the context +asyncio.run(example_non_realtime_tts_streaming()) -# asyncio.run(example_context_manager()) # %% -# Configuration Options +# Realtime TTS # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Different TTS models support various configuration options: +# Realtime TTS models are designed for scenarios where text is generated +# incrementally, such as streaming LLM responses. This enables the lowest +# possible latency by starting audio synthesis before the complete text is ready. 
# -# **DashScope Realtime TTS:** +# **Key Concepts:** # -# - ``voice``: Voice selection (e.g., "Cherry", "Serena", "Ethan", "Chelsie") -# - ``mode``: TTS mode ("server_commit" or "commit") -# - ``cold_start_length``: Minimum text length (characters) before sending the first request -# - ``cold_start_words``: Minimum word count before sending the first request +# - **Stateful Processing**: Realtime TTS maintains state for a single streaming +# session, identified by ``msg.id``. Only one streaming session can be active +# at a time. +# - **Two Methods**: # -# **DashScope TTS:** +# - ``push(msg)``: Non-blocking method that submits text chunks and returns +# immediately. May return partial audio if available. +# - ``synthesize(msg)``: Blocking method that finalizes the session and returns +# all remaining audio. When ``stream=True``, it returns an async generator. # -# - ``voice``: Voice selection -# - ``language_type``: Language type (e.g., "Auto", "Chinese", "English") +# .. code-block:: python # -# **OpenAI TTS:** +# async def example_realtime_tts_streaming(): +# tts_model = DashScopeRealtimeTTSModel( +# api_key=os.environ.get("DASHSCOPE_API_KEY", ""), +# model_name="qwen3-tts-flash-realtime", +# voice="Cherry", +# stream=False, +# ) # -# - ``voice``: Voice selection (e.g., "alloy", "ash", "ballad", "coral") -# - ``model_name``: Model selection ("gpt-4o-mini-tts", "tts-1", "tts-1-hd") +# # realtime tts model received accumulative text chunks +# res = await tts_model.push(msg_chunk_1) # non-blocking +# res = await tts_model.push(msg_chunk_2) # non-blocking +# ... +# res = await tts_model.synthesize(final_msg) # blocking, get all remaining audio # -# **Gemini TTS:** +# When setting ``stream=True`` during initialization, the ``synthesize()`` method returns an async generator of ``TTSResponse`` objects, allowing you to process audio chunks as they arrive. 
# -# - ``voice``: Voice selection (e.g., "Zephyr", "Kore", "Orus", "Autonoe") -# - ``model_name``: Model selection (e.g., "gemini-2.5-flash-preview-tts") - - -async def example_configuration() -> None: - """An example showing different configuration options.""" - # DashScope Realtime TTS with custom configuration - tts_model = DashScopeRealtimeTTSModel( - api_key=os.environ.get("DASHSCOPE_API_KEY", ""), - model_name="qwen3-tts-flash-realtime", - voice="Serena", # Different voice - mode="server_commit", # Server manages text segmentation - cold_start_length=10, # Wait for 10 characters before sending - cold_start_words=3, # Or wait for 3 words - stream=False, # Set to False for simpler example - ) - - await tts_model.connect() - - msg = Msg( - name="assistant", - content="Custom configuration example.", - role="assistant", - ) - tts_response = await tts_model.synthesize(msg) - print(f"TTS Response with custom config: {tts_response}") - - await tts_model.close() - - -# asyncio.run(example_configuration()) - -# %% -# Handling TTS Responses +# +# Integrating with ReActAgent # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# TTS models return ``TTSResponse`` objects that contain ``AudioBlock`` instances. -# Each ``AudioBlock`` contains base64-encoded audio data that can be decoded and played: - - -async def example_handling_response() -> None: - """An example of handling TTS responses and audio data.""" - import base64 - - tts_model = DashScopeRealtimeTTSModel( - api_key=os.environ.get("DASHSCOPE_API_KEY", ""), - model_name="qwen3-tts-flash-realtime", - voice="Cherry", - stream=False, # Set to False for simpler example - ) +# AgentScope agents can automatically synthesize their responses to speech +# when provided with a TTS model. This works seamlessly with both realtime +# and non-realtime TTS models. +# +# **How It Works:** +# +# 1. The agent generates a text response (potentially streamed from an LLM) +# 2. The TTS model synthesizes the text to audio automatically +# 3. 
The synthesized audio is attached to the ``speech`` field of the ``Msg`` object +# 4. The audio is played during the agent's ``self.print()`` method +# - await tts_model.connect() - msg = Msg( - name="assistant", - content="This example shows how to handle TTS responses.", - role="assistant", +async def example_agent_with_tts() -> None: + """An example of using TTS with ReActAgent.""" + # Create an agent with TTS enabled + agent = ReActAgent( + name="Assistant", + sys_prompt="You are a helpful assistant.", + model=DashScopeChatModel( + api_key=os.environ.get("DASHSCOPE_API_KEY", ""), + model_name="qwen-max", + stream=True, + ), + formatter=DashScopeChatFormatter(), + # Enable TTS + tts_model=DashScopeRealtimeTTSModel( + api_key=os.getenv("DASHSCOPE_API_KEY"), + model_name="qwen3-tts-flash-realtime", + voice="Cherry", + ), ) + user = UserAgent("User") - tts_response = await tts_model.synthesize(msg) - - # Access audio blocks - for i, audio_block in enumerate(tts_response.content): - print(f"Audio block {i}:") - print(f" Type: {audio_block.type}") - print(f" Source type: {audio_block.source.type}") - print(f" Media type: {audio_block.source.media_type}") - print( - f" Data length: {len(audio_block.source.data)} characters (base64)", - ) - - # Decode base64 audio data if needed - # audio_bytes = base64.b64decode(audio_block.source.data) - # # Now you can save or play the audio - - # Access response metadata - print(f"Response ID: {tts_response.id}") - print(f"Created at: {tts_response.created_at}") - print(f"Is last: {tts_response.is_last}") - - await tts_model.close() - + # Build a conversation just like normal + msg = None + while True: + msg = await agent(msg) + msg = await user(msg) + if msg.get_text_content() == "exit": + break -# asyncio.run(example_handling_response()) # %% +# Customizing TTS Model +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# You can create custom TTS implementations by inheriting from ``TTSModelBase``. 
+# The base class provides a flexible interface for both realtime and non-realtime +# TTS models. +# The ``supports_streaming_input`` attribute indicates whether the TTS model is realtime. +# +# For realtime TTS models, you need to implement the ``connect``, ``close``, ``push`` and ``synthesize`` methods to handle the lifecycle and streaming input. +# +# For non-realtime TTS models, you only need to implement the ``synthesize`` method. +# # Further Reading # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # - :ref:`agent` - Learn more about agents in AgentScope diff --git a/docs/tutorial/zh_CN/index.rst b/docs/tutorial/zh_CN/index.rst index a9f53b16c1..704bcf4b67 100644 --- a/docs/tutorial/zh_CN/index.rst +++ b/docs/tutorial/zh_CN/index.rst @@ -31,28 +31,45 @@ Welcome to AgentScope's documentation! tutorial/faq + .. toctree:: :maxdepth: 1 - :caption: Task Guides + :caption: Model and Context tutorial/task_model tutorial/task_prompt - tutorial/task_tool + tutorial/task_token tutorial/task_memory tutorial/task_long_term_memory + +.. toctree:: + :maxdepth: 1 + :caption: Tool + + tutorial/task_tool + tutorial/task_mcp + tutorial/task_agent_skill + +.. toctree:: + :maxdepth: 1 + :caption: Agent + tutorial/task_agent + tutorial/task_state + tutorial/task_hook + +.. toctree:: + :maxdepth: 1 + :caption: Features + tutorial/task_pipeline tutorial/task_plan tutorial/task_rag - tutorial/task_state - tutorial/task_hook - tutorial/task_mcp - tutorial/task_agent_skill tutorial/task_studio tutorial/task_tracing tutorial/task_eval tutorial/task_embedding - tutorial/task_token + tutorial/task_tts .. toctree:: :maxdepth: 1 @@ -76,3 +93,4 @@ Welcome to AgentScope's documentation! 
api/agentscope.tracing api/agentscope.session api/agentscope.exception + api/agentscope.tts diff --git a/examples/functionality/tts/main.py b/examples/functionality/tts/main.py index 007e80e1a4..facf2bf08b 100644 --- a/examples/functionality/tts/main.py +++ b/examples/functionality/tts/main.py @@ -28,15 +28,19 @@ async def main() -> None: sys_prompt="You are a helpful assistant named Friday.", model=DashScopeChatModel( api_key=os.environ.get("DASHSCOPE_API_KEY"), - model_name="qwen-max", + model_name="qwen3-max", enable_thinking=False, stream=True, ), formatter=DashScopeChatFormatter(), toolkit=toolkit, memory=InMemoryMemory(), + # tts_model=DashScopeTTSModel( + # model_name="qwen3-tts-flash", + # api_key=os.environ.get("DASHSCOPE_API_KEY"), + # ), tts_model=DashScopeRealtimeTTSModel( - model_name="qwen-tts-realtime", + model_name="qwen3-tts-flash-realtime", api_key=os.environ.get("DASHSCOPE_API_KEY"), voice="Cherry", ), @@ -45,10 +49,10 @@ async def main() -> None: msg = None while True: + msg = await agent(msg) msg = await user(msg) if msg.get_text_content() == "exit": break - msg = await agent(msg) asyncio.run(main()) diff --git a/src/agentscope/agent/_agent_base.py b/src/agentscope/agent/_agent_base.py index 0ac03550d2..722297780d 100644 --- a/src/agentscope/agent/_agent_base.py +++ b/src/agentscope/agent/_agent_base.py @@ -215,11 +215,15 @@ async def print(self, msg: Msg, last: bool = True) -> None: # and the thinking blocks thinking_and_text_to_print = [] - for block in msg.get_content_blocks(): - if block["type"] == "audio": - self._process_audio_block(msg.id, block) + # Play audio block if exists + if isinstance(msg.speech, list): + for audio_block in msg.speech: + self._process_audio_block(msg.id, audio_block) + elif isinstance(msg.speech, dict): + self._process_audio_block(msg.id, msg.speech) - elif block["type"] == "text": + for block in msg.get_content_blocks(): + if block["type"] == "text": self._print_text_block( msg.id, name_prefix=msg.name, @@ 
-385,18 +389,28 @@ def _print_text_block( def _print_last_block( self, - block: ToolUseBlock | ToolResultBlock | ImageBlock | VideoBlock, + block: ToolUseBlock + | ToolResultBlock + | ImageBlock + | VideoBlock + | AudioBlock, msg: Msg, ) -> None: """Process and print the last content block, and the block type - is not audio, text, or thinking. + is not text, or thinking. Args: - block (`ToolUseBlock | ToolResultBlock | ImageBlock | VideoBlock`): + block (`ToolUseBlock | ToolResultBlock | ImageBlock | VideoBlock \ + | AudioBlock`): The content block to be printed msg (`Msg`): The message object """ + # TODO: We should consider how to handle the multimodal blocks in the + # terminal, since the base64 data may be too long to display. + if block.get("type") in ["image", "video", "audio"]: + return + text_prefix = self._stream_prefix.get(msg.id, {}).get("text", "") if text_prefix: diff --git a/src/agentscope/agent/_react_agent.py b/src/agentscope/agent/_react_agent.py index 83c7e1edd3..5a93e145ff 100644 --- a/src/agentscope/agent/_react_agent.py +++ b/src/agentscope/agent/_react_agent.py @@ -451,7 +451,10 @@ async def _reasoning( msg.speech = msg.get_content_blocks("audio") or None # Push to TTS model if available - if self.tts_model: + if ( + self.tts_model + and self.tts_model.supports_streaming_input + ): tts_res = await self.tts_model.push(msg) msg.speech = tts_res.content diff --git a/src/agentscope/tts/_dashscope_realtime_tts_model.py b/src/agentscope/tts/_dashscope_realtime_tts_model.py index 2f175ea98c..da9128aee3 100644 --- a/src/agentscope/tts/_dashscope_realtime_tts_model.py +++ b/src/agentscope/tts/_dashscope_realtime_tts_model.py @@ -309,7 +309,6 @@ async def close(self) -> None: self._connected = False - # TODO: 删去这里 self._tts_client.finish() self._tts_client.close() @@ -424,7 +423,7 @@ async def synthesize( else: # Record current message ID self._current_msg_id = msg.id - delta_to_send = msg.get_text_content().removeprefix( + delta_to_send = 
(msg.get_text_content() or "").removeprefix( self._current_prefix, ) diff --git a/src/agentscope/tts/_dashscope_tts_model.py b/src/agentscope/tts/_dashscope_tts_model.py index abc4c3e99c..a15847f43c 100644 --- a/src/agentscope/tts/_dashscope_tts_model.py +++ b/src/agentscope/tts/_dashscope_tts_model.py @@ -176,31 +176,3 @@ async def _parse_into_async_generator( ], is_last=True, ) - - async def push( - self, - msg: Msg, - **kwargs: Any, - ) -> TTSResponse: - """Append text to be synthesized and return the received TTS response. - - .. note:: - This method is not supported for DashScope TTS model as it does not - support streaming input (``supports_streaming_input=False``). - This method always returns an empty response. - - To synthesize speech, use the `synthesize` method instead. - - Args: - msg (`Msg`): - The message to be synthesized. The `msg.id` identifies the - streaming input request. - **kwargs (`Any`): - Additional keyword arguments to pass to the TTS API call. - - Returns: - `TTSResponse`: - Always returns an empty TTSResponse as streaming input is not - supported. - """ - return TTSResponse(content=[]) diff --git a/src/agentscope/tts/_gemini_tts_model.py b/src/agentscope/tts/_gemini_tts_model.py index 240cc82245..d511536981 100644 --- a/src/agentscope/tts/_gemini_tts_model.py +++ b/src/agentscope/tts/_gemini_tts_model.py @@ -208,31 +208,3 @@ async def _parse_into_async_generator( ], ) yield TTSResponse(content=[]) - - async def push( - self, - msg: Msg, - **kwargs: Any, - ) -> TTSResponse: - """Append text to be synthesized and return the received TTS response. - - .. note:: - This method is not supported for Gemini TTS model as it does not - support streaming input (``supports_streaming_input=False``). - This method always returns an empty response. - - To synthesize speech, use the `synthesize` method instead. - - Args: - msg (`Msg`): - The message to be synthesized. The `msg.id` identifies the - streaming input request. 
- **kwargs (`Any`): - Additional keyword arguments to pass to the TTS API call. - - Returns: - `TTSResponse`: - Always returns an empty TTSResponse as streaming input is not - supported. - """ - return TTSResponse(content=[]) diff --git a/src/agentscope/tts/_openai_tts_model.py b/src/agentscope/tts/_openai_tts_model.py index 341fbe912c..a9af7a0bda 100644 --- a/src/agentscope/tts/_openai_tts_model.py +++ b/src/agentscope/tts/_openai_tts_model.py @@ -119,8 +119,6 @@ async def synthesize( **kwargs, ) - # TODO: if we set `response_format` to "wav", do we still need - # decoding? audio_base64 = base64.b64encode(response.content).decode( "utf-8", ) @@ -189,31 +187,3 @@ async def _parse_into_async_generator( ], is_last=True, ) - - async def push( - self, - msg: Msg, - **kwargs: Any, - ) -> TTSResponse: - """Append text to be synthesized and return the received TTS response. - - .. note:: - This method is not supported for OpenAI TTS model as it does not - support streaming input (``supports_streaming_input=False``). - This method always returns an empty response. - - To synthesize speech, use the `synthesize` method instead. - - Args: - msg (`Msg`): - The message to be synthesized. The `msg.id` identifies the - streaming input request. - **kwargs (`Any`): - Additional keyword arguments to pass to the TTS API call. - - Returns: - `TTSResponse`: - Always returns an empty TTSResponse as streaming input is not - supported. - """ - return TTSResponse(content=[]) diff --git a/src/agentscope/tts/_tts_base.py b/src/agentscope/tts/_tts_base.py index 05d1fd9d51..64963f88c2 100644 --- a/src/agentscope/tts/_tts_base.py +++ b/src/agentscope/tts/_tts_base.py @@ -65,7 +65,8 @@ async def __aexit__( ) -> None: """Exit the async context manager and clean up resources if needed.""" if self.supports_streaming_input: - await self.close() + # await self.close() + pass async def connect(self) -> None: """Connect to the TTS model and initialize resources. For non-realtime