Skip to content

Commit ed9a06c

Browse files
Adds new VAD parameters (#1386)
* Adds new VAD parameters
  - min_silence_at_max_speech: Minimum silence duration in ms, used to avoid abrupt cuts when max_speech_duration_s is reached.
  - use_max_poss_sil_at_max_speech: Whether to use the maximum possible silence at max_speech_duration_s or not. If not, the last silence is used.
* Style
* Update doc
* Change min_speech_duration_ms (0 -> 250)
* Change min_speech_duration_ms to zero: set the minimum speech duration to zero for flexibility.
---------
Co-authored-by: Mahmoud Ashraf <[email protected]>
1 parent 2eeafe0 commit ed9a06c

File tree

1 file changed

+58
-24
lines changed

1 file changed

+58
-24
lines changed

faster_whisper/vad.py

Lines changed: 58 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -27,11 +27,15 @@ class VadOptions:
2727
min_speech_duration_ms: Final speech chunks shorter than min_speech_duration_ms are thrown out.
2828
max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer
2929
than max_speech_duration_s will be split at the timestamp of the last silence that
30-
lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
31-
split aggressively just before max_speech_duration_s.
30+
lasts more than min_silence_at_max_speech (if any), to prevent aggressive cutting.
31+
Otherwise, they will be split aggressively just before max_speech_duration_s.
3232
min_silence_duration_ms: At the end of each speech chunk, wait for min_silence_duration_ms
3333
before separating it
3434
speech_pad_ms: Final speech chunks are padded by speech_pad_ms each side
35+
min_silence_at_max_speech: Minimum silence duration in ms which is used to avoid abrupt cuts
36+
when max_speech_duration_s is reached.
37+
use_max_poss_sil_at_max_speech: Whether to use the maximum possible silence at
38+
max_speech_duration_s or not. If not, the last silence is used.
3539
"""
3640

3741
threshold: float = 0.5
@@ -40,6 +44,8 @@ class VadOptions:
4044
max_speech_duration_s: float = float("inf")
4145
min_silence_duration_ms: int = 2000
4246
speech_pad_ms: int = 400
47+
min_silence_at_max_speech: int = 98
48+
use_max_poss_sil_at_max_speech: bool = True
4349

4450

4551
def get_speech_timestamps(
@@ -69,6 +75,9 @@ def get_speech_timestamps(
6975
min_silence_duration_ms = vad_options.min_silence_duration_ms
7076
window_size_samples = 512
7177
speech_pad_ms = vad_options.speech_pad_ms
78+
min_silence_at_max_speech = vad_options.min_silence_at_max_speech
79+
use_max_poss_sil_at_max_speech = vad_options.use_max_poss_sil_at_max_speech
80+
7281
min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
7382
speech_pad_samples = sampling_rate * speech_pad_ms / 1000
7483
max_speech_samples = (
@@ -77,7 +86,7 @@ def get_speech_timestamps(
7786
- 2 * speech_pad_samples
7887
)
7988
min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
80-
min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
89+
min_silence_samples_at_max_speech = sampling_rate * min_silence_at_max_speech / 1000
8190

8291
audio_length_samples = len(audio)
8392

@@ -91,6 +100,8 @@ def get_speech_timestamps(
91100
triggered = False
92101
speeches = []
93102
current_speech = {}
103+
possible_ends = []
104+
94105
if neg_threshold is None:
95106
neg_threshold = max(threshold - 0.15, 0.01)
96107

@@ -100,45 +111,67 @@ def get_speech_timestamps(
100111
prev_end = next_start = 0
101112

102113
for i, speech_prob in enumerate(speech_probs):
114+
cur_sample = window_size_samples * i
115+
103116
if (speech_prob >= threshold) and temp_end:
117+
sil_dur = cur_sample - temp_end
118+
if sil_dur > min_silence_samples_at_max_speech:
119+
possible_ends.append((temp_end, sil_dur))
104120
temp_end = 0
105121
if next_start < prev_end:
106-
next_start = window_size_samples * i
122+
next_start = cur_sample
107123

108124
if (speech_prob >= threshold) and not triggered:
109125
triggered = True
110-
current_speech["start"] = window_size_samples * i
126+
current_speech["start"] = cur_sample
111127
continue
112128

113-
if (
114-
triggered
115-
and (window_size_samples * i) - current_speech["start"] > max_speech_samples
116-
):
117-
if prev_end:
129+
if triggered and (cur_sample - current_speech["start"] > max_speech_samples):
130+
if use_max_poss_sil_at_max_speech and possible_ends:
131+
prev_end, dur = max(possible_ends, key=lambda x: x[1])
118132
current_speech["end"] = prev_end
119133
speeches.append(current_speech)
120134
current_speech = {}
121-
# previously reached silence (< neg_thres) and is still not speech (< thres)
122-
if next_start < prev_end:
123-
triggered = False
124-
else:
135+
next_start = prev_end + dur
136+
137+
if next_start < prev_end + cur_sample:
125138
current_speech["start"] = next_start
139+
else:
140+
triggered = False
126141
prev_end = next_start = temp_end = 0
142+
possible_ends = []
127143
else:
128-
current_speech["end"] = window_size_samples * i
129-
speeches.append(current_speech)
130-
current_speech = {}
131-
prev_end = next_start = temp_end = 0
132-
triggered = False
133-
continue
144+
if prev_end:
145+
current_speech["end"] = prev_end
146+
speeches.append(current_speech)
147+
current_speech = {}
148+
if next_start < prev_end:
149+
triggered = False
150+
else:
151+
current_speech["start"] = next_start
152+
prev_end = next_start = temp_end = 0
153+
possible_ends = []
154+
else:
155+
current_speech["end"] = cur_sample
156+
speeches.append(current_speech)
157+
current_speech = {}
158+
prev_end = next_start = temp_end = 0
159+
triggered = False
160+
possible_ends = []
161+
continue
134162

135163
if (speech_prob < neg_threshold) and triggered:
136164
if not temp_end:
137-
temp_end = window_size_samples * i
138-
# condition to avoid cutting in very short silence
139-
if (window_size_samples * i) - temp_end > min_silence_samples_at_max_speech:
165+
temp_end = cur_sample
166+
sil_dur_now = cur_sample - temp_end
167+
168+
if (
169+
not use_max_poss_sil_at_max_speech
170+
and sil_dur_now > min_silence_samples_at_max_speech
171+
):
140172
prev_end = temp_end
141-
if (window_size_samples * i) - temp_end < min_silence_samples:
173+
174+
if sil_dur_now < min_silence_samples:
142175
continue
143176
else:
144177
current_speech["end"] = temp_end
@@ -149,6 +182,7 @@ def get_speech_timestamps(
149182
current_speech = {}
150183
prev_end = next_start = temp_end = 0
151184
triggered = False
185+
possible_ends = []
152186
continue
153187

154188
if (

0 commit comments

Comments
 (0)