@@ -27,11 +27,15 @@ class VadOptions:
2727 min_speech_duration_ms: Final speech chunks shorter than min_speech_duration_ms are thrown out.
2828 max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer
2929 than max_speech_duration_s will be split at the timestamp of the last silence that
30- lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
31- split aggressively just before max_speech_duration_s.
30+ lasts more than min_silence_at_max_speech (if any), to prevent aggressive cutting.
31+ Otherwise, they will be split aggressively just before max_speech_duration_s.
3232 min_silence_duration_ms: At the end of each speech chunk, wait for min_silence_duration_ms
3333 before separating it.
3434 speech_pad_ms: Final speech chunks are padded by speech_pad_ms each side
35+ min_silence_at_max_speech: Minimum silence duration in ms which is used to avoid abrupt cuts
36+ when max_speech_duration_s is reached.
37+ use_max_poss_sil_at_max_speech: Whether to split at the longest recorded silence when
38+ max_speech_duration_s is reached. If not, the last silence is used.
3539 """
3640
3741 threshold : float = 0.5
@@ -40,6 +44,8 @@ class VadOptions:
4044 max_speech_duration_s : float = float ("inf" )
4145 min_silence_duration_ms : int = 2000
4246 speech_pad_ms : int = 400
47+ min_silence_at_max_speech : int = 98
48+ use_max_poss_sil_at_max_speech : bool = True
4349
4450
4551def get_speech_timestamps (
@@ -69,6 +75,9 @@ def get_speech_timestamps(
6975 min_silence_duration_ms = vad_options .min_silence_duration_ms
7076 window_size_samples = 512
7177 speech_pad_ms = vad_options .speech_pad_ms
78+ min_silence_at_max_speech = vad_options .min_silence_at_max_speech
79+ use_max_poss_sil_at_max_speech = vad_options .use_max_poss_sil_at_max_speech
80+
7281 min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
7382 speech_pad_samples = sampling_rate * speech_pad_ms / 1000
7483 max_speech_samples = (
@@ -77,7 +86,7 @@ def get_speech_timestamps(
7786 - 2 * speech_pad_samples
7887 )
7988 min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
80- min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
89+ min_silence_samples_at_max_speech = sampling_rate * min_silence_at_max_speech / 1000
8190
8291 audio_length_samples = len (audio )
8392
@@ -91,6 +100,8 @@ def get_speech_timestamps(
91100 triggered = False
92101 speeches = []
93102 current_speech = {}
103+ possible_ends = []
104+
94105 if neg_threshold is None :
95106 neg_threshold = max (threshold - 0.15 , 0.01 )
96107
@@ -100,45 +111,67 @@ def get_speech_timestamps(
100111 prev_end = next_start = 0
101112
102113 for i , speech_prob in enumerate (speech_probs ):
114+ cur_sample = window_size_samples * i
115+
103116 if (speech_prob >= threshold ) and temp_end :
117+ sil_dur = cur_sample - temp_end
118+ if sil_dur > min_silence_samples_at_max_speech :
119+ possible_ends .append ((temp_end , sil_dur ))
104120 temp_end = 0
105121 if next_start < prev_end :
106- next_start = window_size_samples * i
122+ next_start = cur_sample
107123
108124 if (speech_prob >= threshold ) and not triggered :
109125 triggered = True
110- current_speech ["start" ] = window_size_samples * i
126+ current_speech ["start" ] = cur_sample
111127 continue
112128
113- if (
114- triggered
115- and (window_size_samples * i ) - current_speech ["start" ] > max_speech_samples
116- ):
117- if prev_end :
129+ if triggered and (cur_sample - current_speech ["start" ] > max_speech_samples ):
130+ if use_max_poss_sil_at_max_speech and possible_ends :
131+ prev_end , dur = max (possible_ends , key = lambda x : x [1 ])
118132 current_speech ["end" ] = prev_end
119133 speeches .append (current_speech )
120134 current_speech = {}
121- # previously reached silence (< neg_thres) and is still not speech (< thres)
122- if next_start < prev_end :
123- triggered = False
124- else :
135+ next_start = prev_end + dur
136+
137+ if next_start < prev_end + cur_sample :
125138 current_speech ["start" ] = next_start
139+ else :
140+ triggered = False
126141 prev_end = next_start = temp_end = 0
142+ possible_ends = []
127143 else :
128- current_speech ["end" ] = window_size_samples * i
129- speeches .append (current_speech )
130- current_speech = {}
131- prev_end = next_start = temp_end = 0
132- triggered = False
133- continue
144+ if prev_end :
145+ current_speech ["end" ] = prev_end
146+ speeches .append (current_speech )
147+ current_speech = {}
148+ if next_start < prev_end :
149+ triggered = False
150+ else :
151+ current_speech ["start" ] = next_start
152+ prev_end = next_start = temp_end = 0
153+ possible_ends = []
154+ else :
155+ current_speech ["end" ] = cur_sample
156+ speeches .append (current_speech )
157+ current_speech = {}
158+ prev_end = next_start = temp_end = 0
159+ triggered = False
160+ possible_ends = []
161+ continue
134162
135163 if (speech_prob < neg_threshold ) and triggered :
136164 if not temp_end :
137- temp_end = window_size_samples * i
138- # condition to avoid cutting in very short silence
139- if (window_size_samples * i ) - temp_end > min_silence_samples_at_max_speech :
165+ temp_end = cur_sample
166+ sil_dur_now = cur_sample - temp_end
167+
168+ if (
169+ not use_max_poss_sil_at_max_speech
170+ and sil_dur_now > min_silence_samples_at_max_speech
171+ ):
140172 prev_end = temp_end
141- if (window_size_samples * i ) - temp_end < min_silence_samples :
173+
174+ if sil_dur_now < min_silence_samples :
142175 continue
143176 else :
144177 current_speech ["end" ] = temp_end
@@ -149,6 +182,7 @@ def get_speech_timestamps(
149182 current_speech = {}
150183 prev_end = next_start = temp_end = 0
151184 triggered = False
185+ possible_ends = []
152186 continue
153187
154188 if (
0 commit comments