@@ -19,6 +19,7 @@ package google.cloud.speech.v1;
 import "google/api/annotations.proto";
 import "google/api/client.proto";
 import "google/api/field_behavior.proto";
+import "google/cloud/speech/v1/resource.proto";
 import "google/longrunning/operations.proto";
 import "google/protobuf/any.proto";
 import "google/protobuf/duration.proto";
@@ -181,7 +182,8 @@ message RecognitionConfig {
   // a lossless encoding (`FLAC` or `LINEAR16`). The accuracy of the speech
   // recognition can be reduced if lossy codecs are used to capture or transmit
   // audio, particularly if background noise is present. Lossy codecs include
-  // `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, `SPEEX_WITH_HEADER_BYTE`, `MP3`.
+  // `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, `SPEEX_WITH_HEADER_BYTE`, `MP3`,
+  // and `WEBM_OPUS`.
   //
   // The `FLAC` and `WAV` audio file formats include a header that describes the
   // included audio content. You can request recognition for `WAV` files that
@@ -236,6 +238,11 @@ message RecognitionConfig {
     // is replaced with a single byte containing the block length. Only Speex
     // wideband is supported. `sample_rate_hertz` must be 16000.
     SPEEX_WITH_HEADER_BYTE = 7;
+
+    // Opus encoded audio frames in WebM container
+    // ([OggOpus](https://wiki.xiph.org/OggOpus)). `sample_rate_hertz` must be
+    // one of 8000, 12000, 16000, 24000, or 48000.
+    WEBM_OPUS = 9;
   }

   // Encoding of audio data sent in all `RecognitionAudio` messages.
@@ -279,6 +286,20 @@ message RecognitionConfig {
   // of the currently supported language codes.
   string language_code = 3 [(google.api.field_behavior) = REQUIRED];

+  // A list of up to 3 additional
+  // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tags,
+  // listing possible alternative languages of the supplied audio.
+  // See [Language
+  // Support](https://cloud.google.com/speech-to-text/docs/languages) for a list
+  // of the currently supported language codes. If alternative languages are
+  // listed, recognition result will contain recognition in the most likely
+  // language detected including the main language_code. The recognition result
+  // will include the language tag of the language detected in the audio. Note:
+  // This feature is only supported for Voice Command and Voice Search use cases
+  // and performance may vary for other use cases (e.g., phone call
+  // transcription).
+  repeated string alternative_language_codes = 18;
+
   // Maximum number of recognition hypotheses to be returned.
   // Specifically, the maximum number of `SpeechRecognitionAlternative` messages
   // within each `SpeechRecognitionResult`.
@@ -293,6 +314,13 @@ message RecognitionConfig {
   // won't be filtered out.
   bool profanity_filter = 5;

+  // Speech adaptation configuration improves the accuracy of speech
+  // recognition. For more information, see the [speech
+  // adaptation](https://cloud.google.com/speech-to-text/docs/adaptation)
+  // documentation.
+  // When speech adaptation is set it supersedes the `speech_contexts` field.
+  SpeechAdaptation adaptation = 20;
+
   // Array of [SpeechContext][google.cloud.speech.v1.SpeechContext].
   // A means to provide context to assist the speech recognition. For more
   // information, see
@@ -306,12 +334,33 @@ message RecognitionConfig {
   // `false`.
   bool enable_word_time_offsets = 8;

+  // If `true`, the top result includes a list of words and the
+  // confidence for those words. If `false`, no word-level confidence
+  // information is returned. The default is `false`.
+  bool enable_word_confidence = 15;
+
   // If 'true', adds punctuation to recognition result hypotheses.
   // This feature is only available in select languages. Setting this for
   // requests in other languages has no effect at all.
   // The default 'false' value does not add punctuation to result hypotheses.
   bool enable_automatic_punctuation = 11;

+  // The spoken punctuation behavior for the call
+  // If not set, uses default behavior based on model of choice
+  // e.g. command_and_search will enable spoken punctuation by default
+  // If 'true', replaces spoken punctuation with the corresponding symbols in
+  // the request. For example, "how are you question mark" becomes "how are
+  // you?". See https://cloud.google.com/speech-to-text/docs/spoken-punctuation
+  // for support. If 'false', spoken punctuation is not replaced.
+  google.protobuf.BoolValue enable_spoken_punctuation = 22;
+
+  // The spoken emoji behavior for the call
+  // If not set, uses default behavior based on model of choice
+  // If 'true', adds spoken emoji formatting for the request. This will replace
+  // spoken emojis with the corresponding Unicode symbols in the final
+  // transcript. If 'false', spoken emojis are not replaced.
+  google.protobuf.BoolValue enable_spoken_emojis = 23;
+
   // Config to enable speaker diarization and set additional
   // parameters to make diarization better suited for your application.
   // Note: When this is enabled, we send all the words from the beginning of the
@@ -537,6 +586,16 @@ message SpeechContext {
   // improves the likelihood of correctly transcribing audio that includes
   // months.
   repeated string phrases = 1;
+
+  // Hint Boost. Positive value will increase the probability that a specific
+  // phrase will be recognized over other similar sounding phrases. The higher
+  // the boost, the higher the chance of false positive recognition as well.
+  // Negative boost values would correspond to anti-biasing. Anti-biasing is not
+  // enabled, so negative boost will simply be ignored. Though `boost` can
+  // accept a wide range of positive values, most use cases are best served with
+  // values between 0 and 20. We recommend using a binary search approach to
+  // finding the optimal value for your use case.
+  float boost = 4;
 }

 // Contains audio data in the encoding specified in the `RecognitionConfig`.
@@ -587,6 +646,12 @@ message LongRunningRecognizeResponse {

   // When available, billed audio seconds for the corresponding request.
   google.protobuf.Duration total_billed_time = 3;
+
+  // Original output config if present in the request.
+  TranscriptOutputConfig output_config = 6;
+
+  // If the transcript output fails this field contains the relevant error.
+  google.rpc.Status output_error = 7;
 }

 // Describes the progress of a long-running `LongRunningRecognize` call. It is
@@ -723,11 +788,10 @@ message StreamingRecognitionResult {
   // For audio_channel_count = N, its output values can range from '1' to 'N'.
   int32 channel_tag = 5;

-  // The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of
-  // the language in this result. This language code was detected to have the
-  // most likelihood of being spoken in the audio.
-  string language_code = 6
-      [(google.api.field_behavior) = OUTPUT_ONLY];
+  // Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
+  // of the language in this result. This language code was detected to have
+  // the most likelihood of being spoken in the audio.
+  string language_code = 6 [(google.api.field_behavior) = OUTPUT_ONLY];
 }

 // A speech recognition result corresponding to a portion of the audio.
@@ -742,6 +806,15 @@ message SpeechRecognitionResult {
   // recognized result for the audio from that channel.
   // For audio_channel_count = N, its output values can range from '1' to 'N'.
   int32 channel_tag = 2;
+
+  // Time offset of the end of this result relative to the
+  // beginning of the audio.
+  google.protobuf.Duration result_end_time = 4;
+
+  // Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
+  // of the language in this result. This language code was detected to have
+  // the most likelihood of being spoken in the audio.
+  string language_code = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
 }

 // Alternative hypotheses (a.k.a. n-best list).
@@ -785,6 +858,15 @@ message WordInfo {
   // The word corresponding to this set of information.
   string word = 3;

+  // The confidence estimate between 0.0 and 1.0. A higher number
+  // indicates an estimated greater likelihood that the recognized words are
+  // correct. This field is set only for the top alternative of a non-streaming
+  // result or, of a streaming result where `is_final=true`.
+  // This field is not guaranteed to be accurate and users should not rely on it
+  // to be always provided.
+  // The default of 0.0 is a sentinel value indicating `confidence` was not set.
+  float confidence = 4;
+
   // Output only. A distinct integer value is assigned for every speaker within
   // the audio. This field specifies which one of those speakers was detected to
   // have spoken this word. Value ranges from '1' to diarization_speaker_count.