@@ -36,7 +36,8 @@ option objc_class_prefix = "GCS";
3636// Service that implements Google Cloud Speech API.
3737service Speech {
3838 option (google.api.default_host ) = "speech.googleapis.com" ;
39- option (google.api.oauth_scopes ) = "https://www.googleapis.com/auth/cloud-platform" ;
39+ option (google.api.oauth_scopes ) =
40+ "https://www.googleapis.com/auth/cloud-platform" ;
4041
4142 // Performs synchronous speech recognition: receive results after all audio
4243 // has been sent and processed.
@@ -54,7 +55,8 @@ service Speech {
5455 // a `LongRunningRecognizeResponse` message.
5556 // For more information on asynchronous speech recognition, see the
5657 // [how-to](https://cloud.google.com/speech-to-text/docs/async-recognize).
57- rpc LongRunningRecognize (LongRunningRecognizeRequest ) returns (google.longrunning.Operation ) {
58+ rpc LongRunningRecognize (LongRunningRecognizeRequest )
59+ returns (google.longrunning.Operation ) {
5860 option (google.api.http ) = {
5961 post : "/v1/speech:longrunningrecognize"
6062 body : "*"
@@ -68,8 +70,8 @@ service Speech {
6870
6971 // Performs bidirectional streaming speech recognition: receive results while
7072 // sending audio. This method is only available via the gRPC API (not REST).
71- rpc StreamingRecognize (stream StreamingRecognizeRequest ) returns ( stream StreamingRecognizeResponse ) {
72- }
73+ rpc StreamingRecognize (stream StreamingRecognizeRequest )
74+ returns ( stream StreamingRecognizeResponse ) { }
7375}
7476
7577// The top-level message sent by the client for the `Recognize` method.
@@ -93,7 +95,8 @@ message LongRunningRecognizeRequest {
9395 RecognitionAudio audio = 2 [(google.api.field_behavior ) = REQUIRED ];
9496
9597 // Optional. Specifies an optional destination for the recognition results.
96- TranscriptOutputConfig output_config = 4 [(google.api.field_behavior ) = OPTIONAL ];
98+ TranscriptOutputConfig output_config = 4
99+ [(google.api.field_behavior ) = OPTIONAL ];
97100}
98101
99102// Specifies an optional destination for the recognition results.
@@ -134,6 +137,15 @@ message StreamingRecognizeRequest {
134137// Provides information to the recognizer that specifies how to process the
135138// request.
136139message StreamingRecognitionConfig {
140+ // Voice activity events for which a timeout can be set.
141+ message VoiceActivityTimeout {
142+ // Duration to time out the stream if no speech begins.
143+ google.protobuf.Duration speech_start_timeout = 1 ;
144+
145+ // Duration to time out the stream after speech ends.
146+ google.protobuf.Duration speech_end_timeout = 2 ;
147+ }
148+
137149 // Required. Provides information to the recognizer that specifies how to
138150 // process the request.
139151 RecognitionConfig config = 1 [(google.api.field_behavior ) = REQUIRED ];
@@ -166,6 +178,15 @@ message StreamingRecognitionConfig {
166178 // the `is_final=false` flag).
167179 // If `false` or omitted, only `is_final=true` result(s) are returned.
168180 bool interim_results = 3 ;
181+
182+ // If `true`, responses with voice activity speech events will be returned as
183+ // they are detected.
184+ bool enable_voice_activity_events = 5 ;
185+
186+ // If set, the server will automatically close the stream after the specified
187+ // duration has elapsed after the last VOICE_ACTIVITY speech event has been
188+ // sent. The field `enable_voice_activity_events` must also be set to true.
189+ VoiceActivityTimeout voice_activity_timeout = 6 ;
169190}
170191
171192// Provides information to the recognizer that specifies how to process the
@@ -193,7 +214,8 @@ message RecognitionConfig {
193214 // an `AudioEncoding` when you send `FLAC` or `WAV` audio, the
194215 // encoding configuration must match the encoding described in the audio
195216 // header; otherwise the request returns an
196- // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT] error code.
217+ // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT] error
218+ // code.
197219 enum AudioEncoding {
198220 // Not specified.
199221 ENCODING_UNSPECIFIED = 0 ;
@@ -246,7 +268,8 @@ message RecognitionConfig {
246268
247269 // Encoding of audio data sent in all `RecognitionAudio` messages.
248270 // This field is optional for `FLAC` and `WAV` audio files and required
249- // for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding].
271+ // for all other audio formats. For details, see
272+ // [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding].
250273 AudioEncoding encoding = 1 ;
251274
252275 // Sample rate in Hertz of the audio data sent in all
@@ -255,7 +278,8 @@ message RecognitionConfig {
255278 // source to 16000 Hz. If that's not possible, use the native sample rate of
256279 // the audio source (instead of re-sampling).
257280 // This field is optional for FLAC and WAV audio files, but is
258- // required for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding].
281+ // required for all other audio formats. For details, see
282+ // [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding].
259283 int32 sample_rate_hertz = 2 ;
260284
261285 // The number of channels in the input audio data.
@@ -454,10 +478,8 @@ message SpeakerDiarizationConfig {
454478 int32 max_speaker_count = 3 ;
455479
456480 // Output only. Unused.
457- int32 speaker_tag = 5 [
458- deprecated = true ,
459- (google.api.field_behavior ) = OUTPUT_ONLY
460- ];
481+ int32 speaker_tag = 5
482+ [deprecated = true , (google.api.field_behavior ) = OUTPUT_ONLY ];
461483}
462484
463485// Description of audio data to be recognized.
@@ -619,8 +641,8 @@ message SpeechContext {
619641
620642// Contains audio data in the encoding specified in the `RecognitionConfig`.
621643// Either `content` or `uri` must be supplied. Supplying both or neither
622- // returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. See
623- // [content limits](https://cloud.google.com/speech-to-text/quotas#content).
644+ // returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
645+ // See [content limits](https://cloud.google.com/speech-to-text/quotas#content).
624646message RecognitionAudio {
625647 // The audio source, which is either inline content or a Google Cloud
626648 // Storage uri.
@@ -635,8 +657,9 @@ message RecognitionAudio {
635657 // Currently, only Google Cloud Storage URIs are
636658 // supported, which must be specified in the following format:
637659 // `gs://bucket_name/object_name` (other URI formats return
638- // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
639- // [Request URIs](https://cloud.google.com/storage/docs/reference-uris).
660+ // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]).
661+ // For more information, see [Request
662+ // URIs](https://cloud.google.com/storage/docs/reference-uris).
640663 string uri = 2 ;
641664 }
642665}
@@ -701,8 +724,8 @@ message LongRunningRecognizeMetadata {
701724 // Time of the most recent processing update.
702725 google.protobuf.Timestamp last_update_time = 3 ;
703726
704- // Output only. The URI of the audio file being transcribed. Empty if the audio was sent
705- // as byte content.
727+ // Output only. The URI of the audio file being transcribed. Empty if the
728+ // audio was sent as byte content.
706729 string uri = 4 [(google.api.field_behavior ) = OUTPUT_ONLY ];
707730}
708731
@@ -769,6 +792,23 @@ message StreamingRecognizeResponse {
769792 // until the server closes the gRPC connection. This event is only sent if
770793 // `single_utterance` was set to `true`, and is not used otherwise.
771794 END_OF_SINGLE_UTTERANCE = 1 ;
795+
796+ // This event indicates that the server has detected the beginning of human
797+ // voice activity in the stream. This event can be returned multiple times
798+ // if speech starts and stops repeatedly throughout the stream. This event
799+ // is only sent if `enable_voice_activity_events` is set to true.
800+ SPEECH_ACTIVITY_BEGIN = 2 ;
801+
802+ // This event indicates that the server has detected the end of human voice
803+ // activity in the stream. This event can be returned multiple times if
804+ // speech starts and stops repeatedly throughout the stream. This event is
805+ // only sent if `enable_voice_activity_events` is set to true.
806+ SPEECH_ACTIVITY_END = 3 ;
807+
808+ // This event indicates that the user-set timeout for speech activity begin
809+ // or end has been exceeded. Upon receiving this event, the client is
810+ // expected to send a half close. Further audio will not be processed.
811+ SPEECH_ACTIVITY_TIMEOUT = 4 ;
772812 }
773813
774814 // If set, returns a [google.rpc.Status][google.rpc.Status] message that
@@ -784,6 +824,9 @@ message StreamingRecognizeResponse {
784824 // Indicates the type of speech event.
785825 SpeechEventType speech_event_type = 4 ;
786826
827+ // Time offset between the beginning of the audio and event emission.
828+ google.protobuf.Duration speech_event_time = 8 ;
829+
787830 // When available, billed audio seconds for the stream.
788831 // Set only if this is the last response in the stream.
789832 google.protobuf.Duration total_billed_time = 5 ;
@@ -828,9 +871,9 @@ message StreamingRecognitionResult {
828871 // For audio_channel_count = N, its output values can range from '1' to 'N'.
829872 int32 channel_tag = 5 ;
830873
831- // Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
832- // of the language in this result. This language code was detected to have
833- // the most likelihood of being spoken in the audio.
874+ // Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt)
875+ // language tag of the language in this result. This language code was
876+ // detected to have the most likelihood of being spoken in the audio.
834877 string language_code = 6 [(google.api.field_behavior ) = OUTPUT_ONLY ];
835878}
836879
@@ -851,9 +894,9 @@ message SpeechRecognitionResult {
851894 // beginning of the audio.
852895 google.protobuf.Duration result_end_time = 4 ;
853896
854- // Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
855- // of the language in this result. This language code was detected to have
856- // the most likelihood of being spoken in the audio.
897+ // Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt)
898+ // language tag of the language in this result. This language code was
899+ // detected to have the most likelihood of being spoken in the audio.
857900 string language_code = 5 [(google.api.field_behavior ) = OUTPUT_ONLY ];
858901}
859902
0 commit comments