Skip to content

Commit beb5cac

Browse files
feat: add ReplicatedVoiceConfig to VoiceConfig to enable Gemini TTS voice replication (#6899)
* feat: add `ReplicatedVoiceConfig` to `VoiceConfig` to enable Gemini TTS voice replication PiperOrigin-RevId: 833480721 Source-Link: googleapis/googleapis@136201b Source-Link: googleapis/googleapis-gen@3dea11e Copy-Tag: eyJwIjoicGFja2FnZXMvZ29vZ2xlLWNsb3VkLWFpcGxhdGZvcm0vLk93bEJvdC55YW1sIiwiaCI6IjNkZWExMWU4ODk0MTBmMGZmN2M3OWQ3ODhkYWY0ZTcwMTZmYTEwYzcifQ== * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --------- Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com> Co-authored-by: Santiago Quiroga <22756465+quirogas@users.noreply.github.com>
1 parent 493bdd6 commit beb5cac

File tree

4 files changed

+2290
-0
lines changed

4 files changed

+2290
-0
lines changed

packages/google-cloud-aiplatform/protos/google/cloud/aiplatform/v1/content.proto

Lines changed: 67 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -184,6 +184,69 @@ message VideoMetadata {
184184
[(google.api.field_behavior) = OPTIONAL];
185185
}
186186

187+
// Configuration for a prebuilt voice. Used as the `prebuilt_voice_config`
// alternative of the `VoiceConfig.voice_config` oneof.
188+
message PrebuiltVoiceConfig {
189+
// The name of the prebuilt voice to use. Declared `optional`, so the field
// has explicit presence: an unset name is distinguishable from an empty
// string.
190+
optional string voice_name = 1;
191+
}
192+
193+
// The configuration for the replicated voice to use. Used as the
// `replicated_voice_config` alternative of the `VoiceConfig.voice_config`
// oneof.
194+
message ReplicatedVoiceConfig {
195+
// Optional. The MIME type of the voice sample. Currently only
196+
// `audio/pcm` is supported, which is raw mono 16-bit signed
197+
// little-endian PCM data with a 24 kHz sampling rate.
198+
string mime_type = 1 [(google.api.field_behavior) = OPTIONAL];
199+
200+
// Optional. The sample of the custom voice, carried as raw bytes.
// NOTE(review): presumably encoded in the format named by `mime_type` —
// confirm.
201+
bytes voice_sample_audio = 2 [(google.api.field_behavior) = OPTIONAL];
202+
}
203+
204+
205+
// Configuration for a voice. Holds exactly one kind of voice source, chosen
// through the `voice_config` oneof.
206+
message VoiceConfig {
207+
// The configuration for the speaker to use. As a oneof, at most one of the
// members below is set at a time; setting one clears the other.
// NOTE(review): field number 2 is not used in this message — if it belonged
// to a removed field, a `reserved 2;` entry would guard against reuse;
// confirm.
208+
oneof voice_config {
209+
// The configuration for a prebuilt voice.
210+
PrebuiltVoiceConfig prebuilt_voice_config = 1;
211+
212+
// Optional. The configuration for a replicated voice. This enables users to
213+
// replicate a voice from an audio sample.
214+
ReplicatedVoiceConfig replicated_voice_config = 3
215+
[(google.api.field_behavior) = OPTIONAL];
216+
}
217+
}
218+
219+
// Configuration for a single speaker in a multi-speaker setup. Used as the
// element type of `MultiSpeakerVoiceConfig.speaker_voice_configs`.
220+
message SpeakerVoiceConfig {
221+
// Required. The name of the speaker. This should be the same as the speaker
222+
// name used in the prompt.
223+
string speaker = 1 [(google.api.field_behavior) = REQUIRED];
224+
225+
// Required. The configuration for the voice of this speaker.
226+
VoiceConfig voice_config = 2 [(google.api.field_behavior) = REQUIRED];
227+
}
228+
229+
// Configuration for a multi-speaker text-to-speech request. Referenced from
// `SpeechConfig.multi_speaker_voice_config`.
230+
message MultiSpeakerVoiceConfig {
231+
// Required. A list of configurations for the voices of the speakers. Exactly
232+
// two speaker voice configurations must be provided.
// The `repeated` type itself places no limit on the element count, so the
// two-entry requirement is not expressed by the schema.
// NOTE(review): numbering starts at 2; field number 1 is not used in this
// message — confirm whether it should be `reserved`.
233+
repeated SpeakerVoiceConfig speaker_voice_configs = 2
234+
[(google.api.field_behavior) = REQUIRED];
235+
}
236+
237+
// Configuration for speech generation. Referenced from
// `GenerationConfig.speech_config`.
238+
message SpeechConfig {
239+
// The configuration for the voice to use (single-voice case).
240+
VoiceConfig voice_config = 1;
241+
242+
// Optional. The language code (ISO 639-1) for the speech synthesis.
243+
string language_code = 2 [(google.api.field_behavior) = OPTIONAL];
244+
245+
// The configuration for a multi-speaker text-to-speech request.
246+
// This field is mutually exclusive with `voice_config`.
// The two fields are plain members rather than members of a oneof, so the
// schema does not enforce this exclusivity.
247+
MultiSpeakerVoiceConfig multi_speaker_voice_config = 3;
248+
}
249+
187250
// Config for image generation features.
188251
message ImageConfig {
189252
// Optional. The desired aspect ratio for the generated images. The following
@@ -347,6 +410,10 @@ message GenerationConfig {
347410
optional RoutingConfig routing_config = 17
348411
[(google.api.field_behavior) = OPTIONAL];
349412

413+
// Optional. The speech generation config.
414+
optional SpeechConfig speech_config = 23
415+
[(google.api.field_behavior) = OPTIONAL];
416+
350417
// Optional. Config for thinking features.
351418
// An error will be returned if this field is set for models that don't
352419
// support thinking.

0 commit comments

Comments
 (0)