Skip to content

Commit c66276d

Browse files
feat: Voice Activity Detection: adding speech event time and speech event type (#4020)
* feat: Voice Activity Detection: adding speech event time and speech event type PiperOrigin-RevId: 511839326 Source-Link: googleapis/googleapis@f04b136 Source-Link: googleapis/googleapis-gen@2130aec Copy-Tag: eyJwIjoicGFja2FnZXMvZ29vZ2xlLWNsb3VkLXNwZWVjaC8uT3dsQm90LnlhbWwiLCJoIjoiMjEzMGFlYzk4MTgxY2ZiYTNlMmJjOTVjOWYxNGYxM2QwNThhYzNlYiJ9 * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --------- Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
1 parent a16b18c commit c66276d

File tree

8 files changed

+1133
-59
lines changed

8 files changed

+1133
-59
lines changed

packages/google-cloud-speech/protos/google/cloud/speech/v1/cloud_speech.proto

Lines changed: 67 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@ option objc_class_prefix = "GCS";
3636
// Service that implements Google Cloud Speech API.
3737
service Speech {
3838
option (google.api.default_host) = "speech.googleapis.com";
39-
option (google.api.oauth_scopes) = "https://www.googleapis.com/auth/cloud-platform";
39+
option (google.api.oauth_scopes) =
40+
"https://www.googleapis.com/auth/cloud-platform";
4041

4142
// Performs synchronous speech recognition: receive results after all audio
4243
// has been sent and processed.
@@ -54,7 +55,8 @@ service Speech {
5455
// a `LongRunningRecognizeResponse` message.
5556
// For more information on asynchronous speech recognition, see the
5657
// [how-to](https://cloud.google.com/speech-to-text/docs/async-recognize).
57-
rpc LongRunningRecognize(LongRunningRecognizeRequest) returns (google.longrunning.Operation) {
58+
rpc LongRunningRecognize(LongRunningRecognizeRequest)
59+
returns (google.longrunning.Operation) {
5860
option (google.api.http) = {
5961
post: "/v1/speech:longrunningrecognize"
6062
body: "*"
@@ -68,8 +70,8 @@ service Speech {
6870

6971
// Performs bidirectional streaming speech recognition: receive results while
7072
// sending audio. This method is only available via the gRPC API (not REST).
71-
rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse) {
72-
}
73+
rpc StreamingRecognize(stream StreamingRecognizeRequest)
74+
returns (stream StreamingRecognizeResponse) {}
7375
}
7476

7577
// The top-level message sent by the client for the `Recognize` method.
@@ -93,7 +95,8 @@ message LongRunningRecognizeRequest {
9395
RecognitionAudio audio = 2 [(google.api.field_behavior) = REQUIRED];
9496

9597
// Optional. Specifies an optional destination for the recognition results.
96-
TranscriptOutputConfig output_config = 4 [(google.api.field_behavior) = OPTIONAL];
98+
TranscriptOutputConfig output_config = 4
99+
[(google.api.field_behavior) = OPTIONAL];
97100
}
98101

99102
// Specifies an optional destination for the recognition results.
@@ -134,6 +137,15 @@ message StreamingRecognizeRequest {
134137
// Provides information to the recognizer that specifies how to process the
135138
// request.
136139
message StreamingRecognitionConfig {
140+
// Events that a timeout can be set on for voice activity.
141+
message VoiceActivityTimeout {
142+
// Duration to timeout the stream if no speech begins.
143+
google.protobuf.Duration speech_start_timeout = 1;
144+
145+
// Duration to timeout the stream after speech ends.
146+
google.protobuf.Duration speech_end_timeout = 2;
147+
}
148+
137149
// Required. Provides information to the recognizer that specifies how to
138150
// process the request.
139151
RecognitionConfig config = 1 [(google.api.field_behavior) = REQUIRED];
@@ -166,6 +178,15 @@ message StreamingRecognitionConfig {
166178
// the `is_final=false` flag).
167179
// If `false` or omitted, only `is_final=true` result(s) are returned.
168180
bool interim_results = 3;
181+
182+
// If `true`, responses with voice activity speech events will be returned as
183+
// they are detected.
184+
bool enable_voice_activity_events = 5;
185+
186+
// If set, the server will automatically close the stream after the specified
187+
// duration has elapsed after the last VOICE_ACTIVITY speech event has been
188+
// sent. The field `enable_voice_activity_events` must also be set to true.
189+
VoiceActivityTimeout voice_activity_timeout = 6;
169190
}
170191

171192
// Provides information to the recognizer that specifies how to process the
@@ -193,7 +214,8 @@ message RecognitionConfig {
193214
// an `AudioEncoding` when you send `FLAC` or `WAV` audio, the
194215
// encoding configuration must match the encoding described in the audio
195216
// header; otherwise the request returns an
196-
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT] error code.
217+
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT] error
218+
// code.
197219
enum AudioEncoding {
198220
// Not specified.
199221
ENCODING_UNSPECIFIED = 0;
@@ -246,7 +268,8 @@ message RecognitionConfig {
246268

247269
// Encoding of audio data sent in all `RecognitionAudio` messages.
248270
// This field is optional for `FLAC` and `WAV` audio files and required
249-
// for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding].
271+
// for all other audio formats. For details, see
272+
// [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding].
250273
AudioEncoding encoding = 1;
251274

252275
// Sample rate in Hertz of the audio data sent in all
@@ -255,7 +278,8 @@ message RecognitionConfig {
255278
// source to 16000 Hz. If that's not possible, use the native sample rate of
256279
// the audio source (instead of re-sampling).
257280
// This field is optional for FLAC and WAV audio files, but is
258-
// required for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding].
281+
// required for all other audio formats. For details, see
282+
// [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding].
259283
int32 sample_rate_hertz = 2;
260284

261285
// The number of channels in the input audio data.
@@ -454,10 +478,8 @@ message SpeakerDiarizationConfig {
454478
int32 max_speaker_count = 3;
455479

456480
// Output only. Unused.
457-
int32 speaker_tag = 5 [
458-
deprecated = true,
459-
(google.api.field_behavior) = OUTPUT_ONLY
460-
];
481+
int32 speaker_tag = 5
482+
[deprecated = true, (google.api.field_behavior) = OUTPUT_ONLY];
461483
}
462484

463485
// Description of audio data to be recognized.
@@ -619,8 +641,8 @@ message SpeechContext {
619641

620642
// Contains audio data in the encoding specified in the `RecognitionConfig`.
621643
// Either `content` or `uri` must be supplied. Supplying both or neither
622-
// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. See
623-
// [content limits](https://cloud.google.com/speech-to-text/quotas#content).
644+
// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
645+
// See [content limits](https://cloud.google.com/speech-to-text/quotas#content).
624646
message RecognitionAudio {
625647
// The audio source, which is either inline content or a Google Cloud
626648
// Storage uri.
@@ -635,8 +657,9 @@ message RecognitionAudio {
635657
// Currently, only Google Cloud Storage URIs are
636658
// supported, which must be specified in the following format:
637659
// `gs://bucket_name/object_name` (other URI formats return
638-
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
639-
// [Request URIs](https://cloud.google.com/storage/docs/reference-uris).
660+
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]).
661+
// For more information, see [Request
662+
// URIs](https://cloud.google.com/storage/docs/reference-uris).
640663
string uri = 2;
641664
}
642665
}
@@ -701,8 +724,8 @@ message LongRunningRecognizeMetadata {
701724
// Time of the most recent processing update.
702725
google.protobuf.Timestamp last_update_time = 3;
703726

704-
// Output only. The URI of the audio file being transcribed. Empty if the audio was sent
705-
// as byte content.
727+
// Output only. The URI of the audio file being transcribed. Empty if the
728+
// audio was sent as byte content.
706729
string uri = 4 [(google.api.field_behavior) = OUTPUT_ONLY];
707730
}
708731

@@ -769,6 +792,23 @@ message StreamingRecognizeResponse {
769792
// until the server closes the gRPC connection. This event is only sent if
770793
// `single_utterance` was set to `true`, and is not used otherwise.
771794
END_OF_SINGLE_UTTERANCE = 1;
795+
796+
// This event indicates that the server has detected the beginning of human
797+
// voice activity in the stream. This event can be returned multiple times
798+
// if speech starts and stops repeatedly throughout the stream. This event
799+
// is only sent if `enable_voice_activity_events` is set to true.
800+
SPEECH_ACTIVITY_BEGIN = 2;
801+
802+
// This event indicates that the server has detected the end of human voice
803+
// activity in the stream. This event can be returned multiple times if
804+
// speech starts and stops repeatedly throughout the stream. This event is
805+
// only sent if `enable_voice_activity_events` is set to true.
806+
SPEECH_ACTIVITY_END = 3;
807+
808+
// This event indicates that the user-set timeout for speech activity begin
809+
// or end has been exceeded. Upon receiving this event, the client is expected to
810+
// send a half close. Further audio will not be processed.
811+
SPEECH_ACTIVITY_TIMEOUT = 4;
772812
}
773813

774814
// If set, returns a [google.rpc.Status][google.rpc.Status] message that
@@ -784,6 +824,9 @@ message StreamingRecognizeResponse {
784824
// Indicates the type of speech event.
785825
SpeechEventType speech_event_type = 4;
786826

827+
// Time offset between the beginning of the audio and event emission.
828+
google.protobuf.Duration speech_event_time = 8;
829+
787830
// When available, billed audio seconds for the stream.
788831
// Set only if this is the last response in the stream.
789832
google.protobuf.Duration total_billed_time = 5;
@@ -828,9 +871,9 @@ message StreamingRecognitionResult {
828871
// For audio_channel_count = N, its output values can range from '1' to 'N'.
829872
int32 channel_tag = 5;
830873

831-
// Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
832-
// of the language in this result. This language code was detected to have
833-
// the most likelihood of being spoken in the audio.
874+
// Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt)
875+
// language tag of the language in this result. This language code was
876+
// detected to have the most likelihood of being spoken in the audio.
834877
string language_code = 6 [(google.api.field_behavior) = OUTPUT_ONLY];
835878
}
836879

@@ -851,9 +894,9 @@ message SpeechRecognitionResult {
851894
// beginning of the audio.
852895
google.protobuf.Duration result_end_time = 4;
853896

854-
// Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
855-
// of the language in this result. This language code was detected to have
856-
// the most likelihood of being spoken in the audio.
897+
// Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt)
898+
// language tag of the language in this result. This language code was
899+
// detected to have the most likelihood of being spoken in the audio.
857900
string language_code = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
858901
}
859902

0 commit comments

Comments
 (0)