diff --git a/docs/tutorial/en/index.rst b/docs/tutorial/en/index.rst
index 1685353365..e5ecb2684a 100644
--- a/docs/tutorial/en/index.rst
+++ b/docs/tutorial/en/index.rst
@@ -33,26 +33,41 @@ Welcome to AgentScope's documentation!
 
 .. toctree::
    :maxdepth: 1
-   :caption: Task Guides
+   :caption: Model and Context
 
    tutorial/task_model
    tutorial/task_prompt
-   tutorial/task_tool
+   tutorial/task_token
    tutorial/task_memory
    tutorial/task_long_term_memory
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Tool
+
+   tutorial/task_tool
+   tutorial/task_mcp
+   tutorial/task_agent_skill
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Agent
+
    tutorial/task_agent
+   tutorial/task_state
+   tutorial/task_hook
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Features
+
    tutorial/task_pipeline
    tutorial/task_plan
    tutorial/task_rag
-   tutorial/task_state
-   tutorial/task_hook
-   tutorial/task_mcp
-   tutorial/task_agent_skill
    tutorial/task_studio
    tutorial/task_tracing
    tutorial/task_eval
    tutorial/task_embedding
-   tutorial/task_token
    tutorial/task_tts
 
 .. toctree::
@@ -77,3 +92,4 @@ Welcome to AgentScope's documentation!
    api/agentscope.tracing
    api/agentscope.session
    api/agentscope.exception
+   api/agentscope.tts
diff --git a/docs/tutorial/en/src/task_tts.py b/docs/tutorial/en/src/task_tts.py
index 7665f025d9..ced1f3aefa 100644
--- a/docs/tutorial/en/src/task_tts.py
+++ b/docs/tutorial/en/src/task_tts.py
@@ -5,496 +5,236 @@
 TTS
 ====================
 
-AgentScope provides a unified TTS (Text-to-Speech) module that supports multiple TTS providers,
-enabling agents to convert text responses into audio output. This tutorial demonstrates how to use
-TTS models in AgentScope.
+AgentScope provides a unified interface for Text-to-Speech (TTS) models across multiple API providers.
+This tutorial demonstrates how to use TTS models in AgentScope.
 
-The supported TTS providers include:
+AgentScope supports the following TTS APIs:
 
-.. list-table::
+.. list-table:: Built-in TTS Models
     :header-rows: 1
 
-    * - Provider
+    * - API
       - Class
       - Streaming Input
-    * - DashScope Realtime
+      - Non-Streaming Input
+      - Streaming Output
+      - Non-Streaming Output
+    * - DashScope Realtime API
       - ``DashScopeRealtimeTTSModel``
       - ✅
-    * - DashScope
+      - ✅
+      - ✅
+      - ✅
+    * - DashScope API
       - ``DashScopeTTSModel``
       - ❌
-    * - OpenAI
+      - ✅
+      - ✅
+      - ✅
+    * - OpenAI API
       - ``OpenAITTSModel``
       - ❌
-    * - Gemini
+      - ✅
+      - ✅
+      - ✅
+    * - Gemini API
       - ``GeminiTTSModel``
       - ❌
+      - ✅
+      - ✅
+      - ✅
 
-All TTS models inherit from ``TTSModelBase`` and provide a unified interface:
-
-- For **realtime TTS models** (supporting streaming input):
-
-  - ``connect()``: Establish connection to the TTS service
-
-  - ``push(msg)``: Append text chunks incrementally (non-blocking)
-
-  - ``synthesize(msg=None)``: Synthesize speech and block until complete
-
-  - ``close()``: Close the connection and clean up resources
+.. note:: The streaming input and output in AgentScope TTS models are all accumulative.
 
-- For **non-realtime TTS models**:
+**Choosing the Right Model:**
 
-  - ``synthesize(msg)``: Synthesize speech from complete text
+- **Use Non-Realtime TTS** when you have complete text ready (e.g., pre-written
+  responses, complete LLM outputs)
+- **Use Realtime TTS** when text is generated progressively (e.g., streaming
+  LLM responses) for lower latency
 
-The TTS models return ``TTSResponse`` objects containing ``AudioBlock`` instances with base64-encoded audio data.
 """
 
-# %%
-# Basic Usage - Realtime TTS Models
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# For realtime TTS models (like ``DashScopeRealtimeTTSModel``), you need to:
-#
-# 1. Initialize the TTS model with appropriate parameters
-# 2. Connect to the TTS service using ``connect()``
-# 3. Use ``synthesize()`` to synthesize complete text, or ``push()`` for incremental text
-# 4. Close the connection using ``close()``
-#
-# Let's start with a simple example using DashScope Realtime TTS:
-
 import asyncio
 import os
-from typing import AsyncGenerator
 
+from agentscope.agent import ReActAgent, UserAgent
+from agentscope.formatter import DashScopeChatFormatter
 from agentscope.message import Msg
+from agentscope.model import DashScopeChatModel
 from agentscope.tts import (
     DashScopeRealtimeTTSModel,
     DashScopeTTSModel,
-    OpenAITTSModel,
-    GeminiTTSModel,
-    TTSResponse,
 )
 
-
-async def example_basic_realtime_tts() -> None:
-    """A basic example of using DashScope Realtime TTS."""
-    # Initialize the TTS model
-    tts_model = DashScopeRealtimeTTSModel(
-        api_key=os.environ.get("DASHSCOPE_API_KEY", ""),
-        model_name="qwen3-tts-flash-realtime",
-        voice="Cherry",
-        stream=False,  # Set to False for simpler example
-    )
-
-    # Connect to the TTS service
-    await tts_model.connect()
-
-    # Create a message with text content
-    msg = Msg(
-        name="assistant",
-        content="Hello, this is a test of TTS functionality.",
-        role="assistant",
-    )
-
-    # Synthesize the text (blocking until complete)
-    tts_response = await tts_model.synthesize(msg)
-
-    # The response contains audio blocks
-    print(f"TTS Response: {tts_response}")
-    print(f"Number of audio blocks: {len(tts_response.content)}")
-
-    # Clean up
-    await tts_model.close()
-
-
-# asyncio.run(example_basic_realtime_tts())
-
 # %%
-# Basic Usage - Non-Realtime TTS Models
+# Non-Realtime TTS
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# For non-realtime TTS models (like ``DashScopeTTSModel``, ``OpenAITTSModel``, ``GeminiTTSModel``),
-# you can directly call ``synthesize()`` without needing to connect first:
+# Non-realtime TTS models process complete text inputs and are the simplest
+# to use. You can directly call their ``synthesize()`` method.
+#
+# Taking the DashScope TTS model as an example:
 
 
-async def example_basic_non_realtime_tts() -> None:
+async def example_non_realtime_tts() -> None:
     """A basic example of using non-realtime TTS models."""
     # Example with DashScope TTS
-    if os.environ.get("DASHSCOPE_API_KEY"):
-        tts_model = DashScopeTTSModel(
-            api_key=os.environ.get("DASHSCOPE_API_KEY", ""),
-            model_name="qwen3-tts-flash",
-            voice="Cherry",
-        )
-
-        msg = Msg(
-            name="assistant",
-            content="Hello, this is DashScope TTS.",
-            role="assistant",
-        )
-
-        # Directly synthesize without connecting
-        tts_response = await tts_model.synthesize(msg)
-
-        print(f"TTS Response: {tts_response}")
-        print(f"Audio blocks: {len(tts_response.content)}")
-
-    # Example with OpenAI TTS
-    if os.environ.get("OPENAI_API_KEY"):
-        tts_model = OpenAITTSModel(
-            api_key=os.environ.get("OPENAI_API_KEY", ""),
-            model_name="gpt-4o-mini-tts",
-            voice="alloy",
-        )
-
-        msg = Msg(
-            name="assistant",
-            content="Hello, this is OpenAI TTS.",
-            role="assistant",
-        )
-
-        tts_response = await tts_model.synthesize(msg)
-
-        print(f"TTS Response: {tts_response}")
-        print(f"Audio blocks: {len(tts_response.content)}")
-
-    # Example with Gemini TTS
-    if os.environ.get("GEMINI_API_KEY"):
-        tts_model = GeminiTTSModel(
-            api_key=os.environ.get("GEMINI_API_KEY", ""),
-            model_name="gemini-2.5-flash-preview-tts",
-            voice="Kore",
-        )
-
-        msg = Msg(
-            name="assistant",
-            content="Hello, this is Gemini TTS.",
-            role="assistant",
-        )
-
-        tts_response = await tts_model.synthesize(msg)
-
-        print(f"TTS Response: {tts_response}")
-        print(f"Audio blocks: {len(tts_response.content)}")
-
-
-# asyncio.run(example_basic_non_realtime_tts())
-
-# %%
-# Using TTS with Agents
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# The most common use case is integrating TTS with agents. AgentScope's ``ReActAgent``
-# supports TTS models through the ``tts_model`` parameter. When a TTS model is provided,
-# the agent will automatically synthesize its text responses into audio.
-#
-# .. note:: The TTS model will be called automatically during agent execution, handling
-#           streaming text incrementally for models that support streaming input.
-
-
-async def example_agent_with_tts() -> None:
-    """An example of using TTS with ReActAgent."""
-    from agentscope.agent import ReActAgent, UserAgent
-    from agentscope.formatter import DashScopeChatFormatter
-    from agentscope.memory import InMemoryMemory
-    from agentscope.model import DashScopeChatModel
-
-    # Create a TTS model
-    tts_model = DashScopeRealtimeTTSModel(
-        api_key=os.environ.get("DASHSCOPE_API_KEY", ""),
-        model_name="qwen3-tts-flash-realtime",
-        voice="Cherry",
-    )
-
-    # Create an agent with TTS enabled
-    agent = ReActAgent(
-        name="Assistant",
-        sys_prompt="You are a helpful assistant.",
-        model=DashScopeChatModel(
-            api_key=os.environ.get("DASHSCOPE_API_KEY", ""),
-            model_name="qwen-max",
-            stream=True,
-        ),
-        formatter=DashScopeChatFormatter(),
-        memory=InMemoryMemory(),
-        tts_model=tts_model,  # Enable TTS
-    )
-
-    user = UserAgent("User")
-
-    # The agent will automatically synthesize its responses
-    msg = await user("Tell me a short story.")
-    response = await agent(msg)
-
-    print(f"Agent response: {response.get_text_content()}")
-
-    # Clean up
-    await tts_model.close()
-
-
-# asyncio.run(example_agent_with_tts())
-
-# %%
-# Streaming Input with Push and Synthesize
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# For realtime TTS models that support streaming input (like ``DashScopeRealtimeTTSModel``),
-# you can use ``push()`` to incrementally send text chunks as they arrive, and then
-# call ``synthesize()`` to get the final audio output.
-#
-# - ``push(msg)``: Non-blocking method that appends text and returns any available audio
-# - ``synthesize(msg=None)``: Blocking method that waits for all audio to be synthesized
-#
-# .. note:: The ``push()`` method uses the message ID (``msg.id``) to track streaming
-#           input requests. All chunks for the same message must have the same ID.
-
-
-async def example_streaming_push_synthesize() -> None:
-    """An example of using push() and synthesize() for streaming input."""
-    tts_model = DashScopeRealtimeTTSModel(
+    tts_model = DashScopeTTSModel(
         api_key=os.environ.get("DASHSCOPE_API_KEY", ""),
-        model_name="qwen3-tts-flash-realtime",
+        model_name="qwen3-tts-flash",
         voice="Cherry",
-        stream=False,  # Set to False for simpler example
+        stream=False,  # Non-streaming output
     )
 
-    await tts_model.connect()
-
-    # Simulate streaming text generation
-    text_chunks = [
-        "Hello, ",
-        "this is ",
-        "a streaming ",
-        "TTS example.",
-    ]
-
-    # Create a message with a consistent ID for all chunks
-    msg_id = "streaming_msg_001"
-    accumulated_text = ""
-
-    for i, chunk in enumerate(text_chunks):
-        # Accumulate text incrementally
-        accumulated_text += chunk
-
-        # Create a message with accumulated text and same ID
-        msg = Msg(
-            name="assistant",
-            content=accumulated_text,
-            role="assistant",
-        )
-        msg.id = msg_id  # Important: same ID for all chunks
-
-        # Push the incremental text (non-blocking)
-        tts_response = await tts_model.push(msg)
-        if tts_response.content:
-            print(
-                f"Chunk {i+1}: Received {len(tts_response.content)} audio blocks",
-            )
-
-    # Finalize synthesis to get all remaining audio
-    final_msg = Msg(
+    msg = Msg(
         name="assistant",
-        content=accumulated_text,
+        content="Hello, this is DashScope TTS.",
         role="assistant",
     )
-    final_msg.id = msg_id
 
-    final_response = await tts_model.synthesize(final_msg)
-    # Handle both TTSResponse and AsyncGenerator cases
-    if isinstance(final_response, AsyncGenerator):
-        async for chunk in final_response:
-            if chunk.content:
-                print(
-                    f"Final synthesis chunk: {len(chunk.content)} audio blocks",
-                )
-    else:
-        print(f"Final synthesis: {len(final_response.content)} audio blocks")
+    # Directly synthesize without connecting
+    tts_response = await tts_model.synthesize(msg)
 
-    await tts_model.close()
+    # tts_response.content contains an audio block with base64-encoded audio data
+    print(
+        "The length of audio data:",
+        len(tts_response.content[0]["source"]["data"]),
+    )
 
 
-# asyncio.run(example_streaming_push_synthesize())
+asyncio.run(example_non_realtime_tts())
 
 # %%
-# Streaming Output Mode
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# TTS models support streaming output mode, where audio is returned as an async generator
-# of ``TTSResponse`` objects. This is useful for real-time audio playback.
+# **Streaming Output for Lower Latency:**
+#
+# When ``stream=True``, the model returns audio chunks progressively, allowing
+# you to start playback before synthesis completes. This reduces perceived latency.
 #
-# Set ``stream=True`` when initializing the TTS model to enable streaming output:
 
 
-async def example_streaming_output() -> None:
-    """An example of using streaming output mode."""
-    tts_model = DashScopeRealtimeTTSModel(
+async def example_non_realtime_tts_streaming() -> None:
+    """An example of using non-realtime TTS models with streaming output."""
+    # Example with DashScope TTS with streaming output
+    tts_model = DashScopeTTSModel(
         api_key=os.environ.get("DASHSCOPE_API_KEY", ""),
-        model_name="qwen3-tts-flash-realtime",
+        model_name="qwen3-tts-flash",
         voice="Cherry",
         stream=True,  # Enable streaming output
     )
 
-    await tts_model.connect()
-
     msg = Msg(
         name="assistant",
-        content="This is a streaming output example.",
+        content="Hello, this is DashScope TTS with streaming output.",
         role="assistant",
     )
 
-    # Synthesize returns an async generator when stream=True
-    response_generator = await tts_model.synthesize(msg)
-
-    if isinstance(response_generator, AsyncGenerator):
-        # Streaming mode - iterate over audio chunks
-        async for chunk in response_generator:
-            if chunk.content:
-                print(
-                    f"Received audio chunk: {len(chunk.content)} blocks, is_last={chunk.is_last}",
-                )
-                # Process audio chunk here (e.g., play audio)
-    else:
-        # Non-streaming mode
-        print(f"Received {len(response_generator.content)} audio blocks")
-
-    await tts_model.close()
-
-
-# asyncio.run(example_streaming_output())
-
-# %%
-# Context Manager Usage
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# TTS models support Python's async context manager protocol, which automatically handles
-# connection and cleanup. This is especially useful for realtime TTS models:
-
-
-async def example_context_manager() -> None:
-    """An example of using TTS models as context managers."""
-    # For realtime TTS models, the context manager automatically calls connect() and close()
-    async with DashScopeRealtimeTTSModel(
-        api_key=os.environ.get("DASHSCOPE_API_KEY", ""),
-        model_name="qwen3-tts-flash-realtime",
-        voice="Cherry",
-        stream=False,  # Set to False for simpler example
-    ) as tts_model:
-        msg = Msg(
-            name="assistant",
-            content="Using context manager for TTS.",
-            role="assistant",
+    # Synthesize and receive an async generator for streaming output
+    async for tts_response in await tts_model.synthesize(msg):
+        # Process each audio chunk as it arrives
+        print(
+            "Received audio chunk of length:",
+            len(tts_response.content[0]["source"]["data"]),
         )
-        tts_response = await tts_model.synthesize(msg)
-        print(f"TTS Response: {tts_response}")
 
-    # Connection is automatically closed when exiting the context
 
+asyncio.run(example_non_realtime_tts_streaming())
 
-# asyncio.run(example_context_manager())
 
 # %%
-# Configuration Options
+# Realtime TTS
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# Different TTS models support various configuration options:
+# Realtime TTS models are designed for scenarios where text is generated
+# incrementally, such as streaming LLM responses. This enables the lowest
+# possible latency by starting audio synthesis before the complete text is ready.
 #
-# **DashScope Realtime TTS:**
+# **Key Concepts:**
 #
-# - ``voice``: Voice selection (e.g., "Cherry", "Serena", "Ethan", "Chelsie")
-# - ``mode``: TTS mode ("server_commit" or "commit")
-# - ``cold_start_length``: Minimum text length (characters) before sending the first request
-# - ``cold_start_words``: Minimum word count before sending the first request
+# - **Stateful Processing**: Realtime TTS maintains state for a single streaming
+#   session, identified by ``msg.id``. Only one streaming session can be active
+#   at a time.
+# - **Two Methods**:
 #
-# **DashScope TTS:**
+#   - ``push(msg)``: Non-blocking method that submits text chunks and returns
+#     immediately. May return partial audio if available.
+#   - ``synthesize(msg)``: Blocking method that finalizes the session and returns
+#     all remaining audio. When ``stream=True``, it returns an async generator.
 #
-# - ``voice``: Voice selection
-# - ``language_type``: Language type (e.g., "Auto", "Chinese", "English")
+# .. code-block:: python
 #
-# **OpenAI TTS:**
+#     async def example_realtime_tts_streaming():
+#         tts_model = DashScopeRealtimeTTSModel(
+#             api_key=os.environ.get("DASHSCOPE_API_KEY", ""),
+#             model_name="qwen3-tts-flash-realtime",
+#             voice="Cherry",
+#             stream=False,
+#         )
 #
-# - ``voice``: Voice selection (e.g., "alloy", "ash", "ballad", "coral")
-# - ``model_name``: Model selection ("gpt-4o-mini-tts", "tts-1", "tts-1-hd")
+#         # The realtime TTS model receives accumulative text chunks
+#         res = await tts_model.push(msg_chunk_1)  # non-blocking
+#         res = await tts_model.push(msg_chunk_2)  # non-blocking
+#         ...
+#         res = await tts_model.synthesize(final_msg)  # blocking, get all remaining audio
 #
-# **Gemini TTS:**
+# When setting ``stream=True`` during initialization, the ``synthesize()`` method returns an async generator of ``TTSResponse`` objects, allowing you to process audio chunks as they arrive.
 #
-# - ``voice``: Voice selection (e.g., "Zephyr", "Kore", "Orus", "Autonoe")
-# - ``model_name``: Model selection (e.g., "gemini-2.5-flash-preview-tts")
-
-
-async def example_configuration() -> None:
-    """An example showing different configuration options."""
-    # DashScope Realtime TTS with custom configuration
-    tts_model = DashScopeRealtimeTTSModel(
-        api_key=os.environ.get("DASHSCOPE_API_KEY", ""),
-        model_name="qwen3-tts-flash-realtime",
-        voice="Serena",  # Different voice
-        mode="server_commit",  # Server manages text segmentation
-        cold_start_length=10,  # Wait for 10 characters before sending
-        cold_start_words=3,  # Or wait for 3 words
-        stream=False,  # Set to False for simpler example
-    )
-
-    await tts_model.connect()
-
-    msg = Msg(
-        name="assistant",
-        content="Custom configuration example.",
-        role="assistant",
-    )
-    tts_response = await tts_model.synthesize(msg)
-    print(f"TTS Response with custom config: {tts_response}")
-
-    await tts_model.close()
-
-
-# asyncio.run(example_configuration())
-
-# %%
-# Handling TTS Responses
+#
+# Integrating with ReActAgent
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# TTS models return ``TTSResponse`` objects that contain ``AudioBlock`` instances.
-# Each ``AudioBlock`` contains base64-encoded audio data that can be decoded and played:
-
-
-async def example_handling_response() -> None:
-    """An example of handling TTS responses and audio data."""
-    import base64
-
-    tts_model = DashScopeRealtimeTTSModel(
-        api_key=os.environ.get("DASHSCOPE_API_KEY", ""),
-        model_name="qwen3-tts-flash-realtime",
-        voice="Cherry",
-        stream=False,  # Set to False for simpler example
-    )
+# AgentScope agents can automatically synthesize their responses to speech
+# when provided with a TTS model. This works seamlessly with both realtime
+# and non-realtime TTS models.
+#
+# **How It Works:**
+#
+# 1. The agent generates a text response (potentially streamed from an LLM)
+# 2. The TTS model synthesizes the text to audio automatically
+# 3. The synthesized audio is attached to the ``speech`` field of the ``Msg`` object
+# 4. The audio is played during the agent's ``self.print()`` method
+#
 
-    await tts_model.connect()
 
-    msg = Msg(
-        name="assistant",
-        content="This example shows how to handle TTS responses.",
-        role="assistant",
+async def example_agent_with_tts() -> None:
+    """An example of using TTS with ReActAgent."""
+    # Create an agent with TTS enabled
+    agent = ReActAgent(
+        name="Assistant",
+        sys_prompt="You are a helpful assistant.",
+        model=DashScopeChatModel(
+            api_key=os.environ.get("DASHSCOPE_API_KEY", ""),
+            model_name="qwen-max",
+            stream=True,
+        ),
+        formatter=DashScopeChatFormatter(),
+        # Enable TTS
+        tts_model=DashScopeRealtimeTTSModel(
+            api_key=os.getenv("DASHSCOPE_API_KEY"),
+            model_name="qwen3-tts-flash-realtime",
+            voice="Cherry",
+        ),
     )
+    user = UserAgent("User")
 
-    tts_response = await tts_model.synthesize(msg)
-
-    # Access audio blocks
-    for i, audio_block in enumerate(tts_response.content):
-        print(f"Audio block {i}:")
-        print(f"  Type: {audio_block.type}")
-        print(f"  Source type: {audio_block.source.type}")
-        print(f"  Media type: {audio_block.source.media_type}")
-        print(
-            f"  Data length: {len(audio_block.source.data)} characters (base64)",
-        )
-
-        # Decode base64 audio data if needed
-        # audio_bytes = base64.b64decode(audio_block.source.data)
-        # # Now you can save or play the audio
-
-    # Access response metadata
-    print(f"Response ID: {tts_response.id}")
-    print(f"Created at: {tts_response.created_at}")
-    print(f"Is last: {tts_response.is_last}")
-
-    await tts_model.close()
-
+    # Build a conversation just like normal
+    msg = None
+    while True:
+        msg = await agent(msg)
+        msg = await user(msg)
+        if msg.get_text_content() == "exit":
+            break
 
-# asyncio.run(example_handling_response())
 
 # %%
+# Customizing TTS Model
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# You can create custom TTS implementations by inheriting from ``TTSModelBase``.
+# The base class provides a flexible interface for both realtime and non-realtime
+# TTS models.
+# The ``supports_streaming_input`` attribute indicates whether a TTS model is realtime or not.
+#
+# For realtime TTS models, you need to implement the ``connect``, ``close``, ``push`` and ``synthesize`` methods to handle the lifecycle and streaming input.
+#
+# For non-realtime TTS models, you only need to implement the ``synthesize`` method.
+#
 # Further Reading
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # - :ref:`agent` - Learn more about agents in AgentScope
diff --git a/docs/tutorial/zh_CN/index.rst b/docs/tutorial/zh_CN/index.rst
index a9f53b16c1..704bcf4b67 100644
--- a/docs/tutorial/zh_CN/index.rst
+++ b/docs/tutorial/zh_CN/index.rst
@@ -31,28 +31,45 @@ Welcome to AgentScope's documentation!
 
    tutorial/faq
 
+
 .. toctree::
    :maxdepth: 1
-   :caption: Task Guides
+   :caption: Model and Context
 
    tutorial/task_model
    tutorial/task_prompt
-   tutorial/task_tool
+   tutorial/task_token
    tutorial/task_memory
    tutorial/task_long_term_memory
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Tool
+
+   tutorial/task_tool
+   tutorial/task_mcp
+   tutorial/task_agent_skill
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Agent
+
    tutorial/task_agent
+   tutorial/task_state
+   tutorial/task_hook
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Features
+
    tutorial/task_pipeline
    tutorial/task_plan
    tutorial/task_rag
-   tutorial/task_state
-   tutorial/task_hook
-   tutorial/task_mcp
-   tutorial/task_agent_skill
    tutorial/task_studio
    tutorial/task_tracing
    tutorial/task_eval
    tutorial/task_embedding
-   tutorial/task_token
+   tutorial/task_tts
 
 .. toctree::
    :maxdepth: 1
@@ -76,3 +93,4 @@ Welcome to AgentScope's documentation!
    api/agentscope.tracing
    api/agentscope.session
    api/agentscope.exception
+   api/agentscope.tts
diff --git a/examples/functionality/tts/main.py b/examples/functionality/tts/main.py
index 007e80e1a4..facf2bf08b 100644
--- a/examples/functionality/tts/main.py
+++ b/examples/functionality/tts/main.py
@@ -28,15 +28,19 @@ async def main() -> None:
         sys_prompt="You are a helpful assistant named Friday.",
         model=DashScopeChatModel(
             api_key=os.environ.get("DASHSCOPE_API_KEY"),
-            model_name="qwen-max",
+            model_name="qwen3-max",
             enable_thinking=False,
             stream=True,
         ),
         formatter=DashScopeChatFormatter(),
         toolkit=toolkit,
         memory=InMemoryMemory(),
+        # tts_model=DashScopeTTSModel(
+        #     model_name="qwen3-tts-flash",
+        #     api_key=os.environ.get("DASHSCOPE_API_KEY"),
+        # ),
         tts_model=DashScopeRealtimeTTSModel(
-            model_name="qwen-tts-realtime",
+            model_name="qwen3-tts-flash-realtime",
             api_key=os.environ.get("DASHSCOPE_API_KEY"),
             voice="Cherry",
         ),
@@ -45,10 +49,10 @@ async def main() -> None:
 
     msg = None
     while True:
+        msg = await agent(msg)
         msg = await user(msg)
         if msg.get_text_content() == "exit":
             break
-        msg = await agent(msg)
 
 
 asyncio.run(main())
diff --git a/src/agentscope/agent/_agent_base.py b/src/agentscope/agent/_agent_base.py
index 0ac03550d2..722297780d 100644
--- a/src/agentscope/agent/_agent_base.py
+++ b/src/agentscope/agent/_agent_base.py
@@ -215,11 +215,15 @@ async def print(self, msg: Msg, last: bool = True) -> None:
         # and the thinking blocks
         thinking_and_text_to_print = []
 
-        for block in msg.get_content_blocks():
-            if block["type"] == "audio":
-                self._process_audio_block(msg.id, block)
+        # Play audio block if exists
+        if isinstance(msg.speech, list):
+            for audio_block in msg.speech:
+                self._process_audio_block(msg.id, audio_block)
+        elif isinstance(msg.speech, dict):
+            self._process_audio_block(msg.id, msg.speech)
 
-            elif block["type"] == "text":
+        for block in msg.get_content_blocks():
+            if block["type"] == "text":
                 self._print_text_block(
                     msg.id,
                     name_prefix=msg.name,
@@ -385,18 +389,28 @@ def _print_text_block(
 
     def _print_last_block(
         self,
-        block: ToolUseBlock | ToolResultBlock | ImageBlock | VideoBlock,
+        block: ToolUseBlock
+        | ToolResultBlock
+        | ImageBlock
+        | VideoBlock
+        | AudioBlock,
         msg: Msg,
     ) -> None:
         """Process and print the last content block, and the block type
-        is not audio, text, or thinking.
+        is neither text nor thinking.
 
         Args:
-            block (`ToolUseBlock | ToolResultBlock | ImageBlock | VideoBlock`):
+            block (`ToolUseBlock | ToolResultBlock | ImageBlock | VideoBlock \
+            | AudioBlock`):
                 The content block to be printed
             msg (`Msg`):
                 The message object
         """
+        # TODO: We should consider how to handle the multimodal blocks in the
+        #  terminal, since the base64 data may be too long to display.
+        if block.get("type") in ["image", "video", "audio"]:
+            return
+
         text_prefix = self._stream_prefix.get(msg.id, {}).get("text", "")
 
         if text_prefix:
diff --git a/src/agentscope/agent/_react_agent.py b/src/agentscope/agent/_react_agent.py
index 83c7e1edd3..5a93e145ff 100644
--- a/src/agentscope/agent/_react_agent.py
+++ b/src/agentscope/agent/_react_agent.py
@@ -451,7 +451,10 @@ async def _reasoning(
                         msg.speech = msg.get_content_blocks("audio") or None
 
                         # Push to TTS model if available
-                        if self.tts_model:
+                        if (
+                            self.tts_model
+                            and self.tts_model.supports_streaming_input
+                        ):
                             tts_res = await self.tts_model.push(msg)
                             msg.speech = tts_res.content
 
diff --git a/src/agentscope/tts/_dashscope_realtime_tts_model.py b/src/agentscope/tts/_dashscope_realtime_tts_model.py
index 2f175ea98c..da9128aee3 100644
--- a/src/agentscope/tts/_dashscope_realtime_tts_model.py
+++ b/src/agentscope/tts/_dashscope_realtime_tts_model.py
@@ -309,7 +309,6 @@ async def close(self) -> None:
 
         self._connected = False
 
-        # TODO: 删去这里
         self._tts_client.finish()
         self._tts_client.close()
 
@@ -424,7 +423,7 @@ async def synthesize(
         else:
             # Record current message ID
             self._current_msg_id = msg.id
-            delta_to_send = msg.get_text_content().removeprefix(
+            delta_to_send = (msg.get_text_content() or "").removeprefix(
                 self._current_prefix,
             )
 
diff --git a/src/agentscope/tts/_dashscope_tts_model.py b/src/agentscope/tts/_dashscope_tts_model.py
index abc4c3e99c..a15847f43c 100644
--- a/src/agentscope/tts/_dashscope_tts_model.py
+++ b/src/agentscope/tts/_dashscope_tts_model.py
@@ -176,31 +176,3 @@ async def _parse_into_async_generator(
             ],
             is_last=True,
         )
-
-    async def push(
-        self,
-        msg: Msg,
-        **kwargs: Any,
-    ) -> TTSResponse:
-        """Append text to be synthesized and return the received TTS response.
-
-        .. note::
-            This method is not supported for DashScope TTS model as it does not
-            support streaming input (``supports_streaming_input=False``).
-            This method always returns an empty response.
-
-        To synthesize speech, use the `synthesize` method instead.
-
-        Args:
-            msg (`Msg`):
-                The message to be synthesized. The `msg.id` identifies the
-                streaming input request.
-            **kwargs (`Any`):
-                Additional keyword arguments to pass to the TTS API call.
-
-        Returns:
-            `TTSResponse`:
-                Always returns an empty TTSResponse as streaming input is not
-                supported.
-        """
-        return TTSResponse(content=[])
diff --git a/src/agentscope/tts/_gemini_tts_model.py b/src/agentscope/tts/_gemini_tts_model.py
index 240cc82245..d511536981 100644
--- a/src/agentscope/tts/_gemini_tts_model.py
+++ b/src/agentscope/tts/_gemini_tts_model.py
@@ -208,31 +208,3 @@ async def _parse_into_async_generator(
                 ],
             )
         yield TTSResponse(content=[])
-
-    async def push(
-        self,
-        msg: Msg,
-        **kwargs: Any,
-    ) -> TTSResponse:
-        """Append text to be synthesized and return the received TTS response.
-
-        .. note::
-            This method is not supported for Gemini TTS model as it does not
-            support streaming input (``supports_streaming_input=False``).
-            This method always returns an empty response.
-
-        To synthesize speech, use the `synthesize` method instead.
-
-        Args:
-            msg (`Msg`):
-                The message to be synthesized. The `msg.id` identifies the
-                streaming input request.
-            **kwargs (`Any`):
-                Additional keyword arguments to pass to the TTS API call.
-
-        Returns:
-            `TTSResponse`:
-                Always returns an empty TTSResponse as streaming input is not
-                supported.
-        """
-        return TTSResponse(content=[])
diff --git a/src/agentscope/tts/_openai_tts_model.py b/src/agentscope/tts/_openai_tts_model.py
index 341fbe912c..a9af7a0bda 100644
--- a/src/agentscope/tts/_openai_tts_model.py
+++ b/src/agentscope/tts/_openai_tts_model.py
@@ -119,8 +119,6 @@ async def synthesize(
                 **kwargs,
             )
 
-            # TODO: if we set `response_format` to "wav", do we still need
-            #  decoding?
             audio_base64 = base64.b64encode(response.content).decode(
                 "utf-8",
             )
@@ -189,31 +187,3 @@ async def _parse_into_async_generator(
                 ],
                 is_last=True,
             )
-
-    async def push(
-        self,
-        msg: Msg,
-        **kwargs: Any,
-    ) -> TTSResponse:
-        """Append text to be synthesized and return the received TTS response.
-
-        .. note::
-            This method is not supported for OpenAI TTS model as it does not
-            support streaming input (``supports_streaming_input=False``).
-            This method always returns an empty response.
-
-        To synthesize speech, use the `synthesize` method instead.
-
-        Args:
-            msg (`Msg`):
-                The message to be synthesized. The `msg.id` identifies the
-                streaming input request.
-            **kwargs (`Any`):
-                Additional keyword arguments to pass to the TTS API call.
-
-        Returns:
-            `TTSResponse`:
-                Always returns an empty TTSResponse as streaming input is not
-                supported.
-        """
-        return TTSResponse(content=[])
diff --git a/src/agentscope/tts/_tts_base.py b/src/agentscope/tts/_tts_base.py
index 05d1fd9d51..64963f88c2 100644
--- a/src/agentscope/tts/_tts_base.py
+++ b/src/agentscope/tts/_tts_base.py
@@ -65,7 +65,8 @@ async def __aexit__(
     ) -> None:
         """Exit the async context manager and clean up resources if needed."""
         if self.supports_streaming_input:
-            await self.close()
+            # await self.close()
+            pass
 
     async def connect(self) -> None:
         """Connect to the TTS model and initialize resources. For non-realtime