我试图使用ffmpeg缩短录音中的多余沉默(缩短它们,而不是完全消除沉默)。我使用的当前代码:
ffmpeg -hide_banner -i file_name.m4a -af silenceremove=0:0:0:-1:0.7:-30dB file_name_short.m4a
无效。它会检测到超过0.7秒的静音并将其完全移除,这不是我想要的。任何人都知道如何截断沉默,比如说,将沉默时间缩短1秒以下至0.5秒?
答案 0 :(得分:1)
ffmpeg 的 silenceremove 滤镜的参数似乎只能以固定长度为单位、成块地删除静音。这意味着如果您传入 stop_duration=0.5,而录音中有一段 2.2 秒长的静音,那么最终会剩下 0.2 秒的静音(2.2 - 0.5 - 0.5 - 0.5 - 0.5 = 0.2)。
如果您不介意在 .wav 格式之间来回转换,您可以使用我编写的这个 Python 脚本。它有很多选项;虽然它是用 Python 编写的,但由于使用了 NumPy,它可以在不到一秒的时间内处理短文件,处理 2 小时长的 .wav 文件也只需大约 5.7 秒,速度相当不错。如果想进一步提速,可以用 C++ 重写。对于视频,可以借助 OpenCV 扩展这一方案。
优点:
它仅受其所使用的模块的限制。需要注意的问题是:
在您的情况下的用法:
ffmpeg -i myfile.m4a myfile.wav
python3 trim_silence.py --input=myfile.wav
ffmpeg -i result.wav -i myfile.m4a -map_metadata 1 myfile_trimmed.m4a
完整的使用说明:
usage: trim_silence.py [-h] --input INPUT [--output OUTPUT] [--threshold THRESHOLD] [--silence-dur SILENCE_DUR] [--non-silence-dur NON_SILENCE_DUR]
[--mode MODE] [--auto-threshold] [--auto-aggressiveness AUTO_AGGRESSIVENESS] [--detect-only] [--verbose] [--show-silence] [--time-it]
[--overwrite]
optional arguments:
-h, --help show this help message and exit
--input INPUT (REQUIRED) name of input wav file (default: None)
--output OUTPUT name of output wave file (default: result.wav)
--threshold THRESHOLD
silence threshold - can be expressed in dB, e.g. --threshold=-25.5dB (default: -25dB)
--silence-dur SILENCE_DUR
maximum silence duration desired in output (default: 0.5)
--non-silence-dur NON_SILENCE_DUR
minimum non-silence duration between periods of silence of at least --silence-dur length (default: 0.1)
--mode MODE silence detection mode - can be 'any' or 'all' (default: all)
--auto-threshold automatically determine silence threshold (default: False)
--auto-aggressiveness AUTO_AGGRESSIVENESS
aggressiveness of the auto-threshold algorithm. Integer between [-20,20] (default: 3)
--detect-only don't trim, just detect periods of silence (default: False)
--verbose print general information to the screen (default: False)
--show-silence print locations of silence (always true if --detect-only is used) (default: False)
--time-it show steps and time to complete them (default: False)
--overwrite overwrite existing output file, if applicable (default: False)
trim_silence.py 的内容:
import numpy as np
import argparse
import time
import sys
import os
def testmode(mode):
mode = mode.lower()
valid_modes = ["all","any"]
if mode not in valid_modes:
raise Exception("mode '{mode}' is not valid - must be one of {valid_modes}")
return mode
def testaggr(aggr):
try:
aggr = min(20,max(-20,int(aggr)))
return aggr
except:
raise Exception("auto-aggressiveness '{aggr}' is not valid - see usage")
# Command-line interface. ArgumentDefaultsHelpFormatter appends each
# option's default value to its help text.
parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
    "--input",
    required=True,
    type=str,
    help="(REQUIRED) name of input wav file",
)
parser.add_argument(
    "--output",
    type=str,
    default="result.wav",
    help="name of output wave file",
)
parser.add_argument(
    "--threshold",
    type=str,
    default="-25dB",
    help="silence threshold - can be expressed in dB, e.g. --threshold=-25.5dB",
)
parser.add_argument(
    "--silence-dur",
    type=float,
    default=0.5,
    help="maximum silence duration desired in output",
)
parser.add_argument(
    "--non-silence-dur",
    type=float,
    default=0.1,
    help="minimum non-silence duration between periods of silence of at least --silence-dur length",
)
parser.add_argument(
    "--mode",
    type=testmode,
    default="all",
    help="silence detection mode - can be 'any' or 'all'",
)
parser.add_argument(
    "--auto-threshold",
    action="store_true",
    help="automatically determine silence threshold",
)
parser.add_argument(
    "--auto-aggressiveness",
    type=testaggr,
    default=3,
    help="aggressiveness of the auto-threshold algorithm. Integer between [-20,20]",
)
parser.add_argument(
    "--detect-only",
    action="store_true",
    help="don't trim, just detect periods of silence",
)
parser.add_argument(
    "--verbose",
    action="store_true",
    help="print general information to the screen",
)
parser.add_argument(
    "--show-silence",
    action="store_true",
    help="print locations of silence (always true if --detect-only is used)",
)
parser.add_argument(
    "--time-it",
    action="store_true",
    help="show steps and time to complete them",
)
parser.add_argument(
    "--overwrite",
    action="store_true",
    help="overwrite existing output file, if applicable",
)
args = parser.parse_args()

# --detect-only implies listing the silent periods.
args.show_silence = args.detect_only or args.show_silence

# Refuse to clobber an existing output file unless --overwrite was given.
# The check is skipped in detect-only mode.
if not (args.detect_only or args.overwrite) and os.path.isfile(args.output):
    print(f"Output file ({args.output}) already exists. Use --overwrite to overwrite the existing file.")
    sys.exit(1)

# Both durations must be non-negative.
if args.silence_dur < 0:
    raise Exception("Maximum silence duration must be >= 0.0")
if args.non_silence_dur < 0:
    raise Exception("Minimum non-silence duration must be >= 0.0")
# Prefer scipy's WAV reader; fall back to the standard-library `wave`
# module when scipy is not installed.
try:
    from scipy.io import wavfile
    using_scipy = True
except ImportError:
    # Only a missing/broken scipy should trigger the fallback; a bare
    # `except:` would also swallow KeyboardInterrupt and unrelated errors.
    if args.verbose:
        print("Failure using 'import scipy.io.wavfile'. Using 'import wave' instead.")
    import wave
    using_scipy = False

# Echo the effective settings when running verbosely.
if args.verbose:
    print(f"Inputs:\n Input File: {args.input}\n Output File: {args.output}\n Max. Silence Duration: {args.silence_dur}\n Min. Non-silence Duration: {args.non_silence_dur}")
def plot(x):
    """Debug helper: show ``x`` as a scatter of points in a new figure.

    Args:
        x: Sequence or array of values accepted by ``pyplot.plot``.
    """
    # Imported lazily: nothing else in the script needs matplotlib, so it
    # should not be a hard requirement for trimming audio.
    from matplotlib import pyplot as plt

    plt.figure()
    plt.plot(x, 'o')
    plt.show()
def threshold_for_channel(ch):
    """Estimate a silence threshold for one channel of the global ``data``.

    Builds a 100-bin density histogram of the sample magnitudes, finds the
    first place where neighbouring bins differ by less than 0.00002, shifts
    that bin index by ``args.auto_aggressiveness`` and returns the matching
    bin edge.

    Args:
        ch: Channel (column) index; ignored when ``data`` is 1-D.

    Returns:
        The threshold, in the same units as the samples in ``data``.
    """
    global data
    histogram_bins = 100
    # Cap the number of samples (rows) analysed.
    sample_limit = min(1024 * 1024 * 100, data.shape[0])
    is_unsigned_8bit = data.dtype == np.uint8

    if len(data.shape) > 1:
        magnitudes = np.abs(data[:sample_limit, ch] * 1.0)
    else:
        magnitudes = np.abs(data[:sample_limit] * 1.0)
    # NOTE(review): the 8-bit offset is removed *after* taking the absolute
    # value - confirm this is the intended centring for unsigned PCM.
    if is_unsigned_8bit:
        magnitudes -= 127

    density, bin_edges = np.histogram(magnitudes, bins=histogram_bins, density=True)
    steepness = np.abs(density[1:] - density[:-1])
    # argmax of a boolean array is the first True (or 0 if there is none).
    flat_bin = np.argmax(steepness < 0.00002)
    flat_bin = max(0, min(flat_bin + args.auto_aggressiveness, len(bin_edges) - 1))

    offset = 127 if is_unsigned_8bit else 0
    return bin_edges[flat_bin] + offset
def auto_threshold():
    """Return the largest per-channel automatic threshold for ``data``.

    Returns:
        The maximum of ``threshold_for_channel`` over every channel, never
        less than 0.
    """
    global data
    channel_count = data.shape[1] if len(data.shape) != 1 else 1
    # Seed with 0 so the result is never negative.
    candidates = [0] + [threshold_for_channel(channel) for channel in range(channel_count)]
    return max(candidates)
# Interpret --threshold. In auto mode the string is left as-is here, since
# the threshold is computed from the audio once it has been read.
silence_threshold = str(args.threshold).lower().strip()
if not args.auto_threshold:
    if "db" in silence_threshold:
        # Decibels -> linear amplitude relative to full scale.
        silence_threshold_db = float(silence_threshold.replace("db",""))
        silence_threshold = np.round(10**(silence_threshold_db/20.),6)
    else:
        # Already linear; derive the dB figure for display only.
        silence_threshold = float(silence_threshold)
        silence_threshold_db = 20*np.log10(silence_threshold)

if args.verbose:
    if args.auto_threshold:
        print (f" Silence Threshold: AUTO (aggressiveness={args.auto_aggressiveness})")
    else:
        print (f" Silence Threshold: {silence_threshold} ({np.round(silence_threshold_db,2)} dB)")
    print (f" Silence Mode: {args.mode.upper()}")
    print("")
# Read the input file, timing the operation.
if args.time_it:
    print(f"Reading in data from {args.input}... ",end="",flush=True)
start = time.time()
if using_scipy:
    sample_rate, data = wavfile.read(args.input)
    input_dtype = data.dtype
    Ts = 1./sample_rate  # sample period in seconds
    if args.auto_threshold:
        silence_threshold = auto_threshold()
    elif data.dtype.kind != "f":
        # Integer PCM: rescale the [0, 1] threshold to the sample range.
        # Every floating-point dtype is skipped (not only float32), because
        # the threshold is used unscaled for float samples and would be
        # multiplied by 2**63 for 64-bit floats.
        sampwidth = data.dtype.itemsize
        # NOTE(review): the +0.5 presumably accounts for the mid-scale bias
        # of unsigned 8-bit PCM - confirm.
        if data.dtype == np.uint8:
            silence_threshold += 0.5  # 8-bit unsigned PCM
        scale_factor = (256**sampwidth)/2.
        silence_threshold *= scale_factor
else:
    # Standard-library fallback: only 16-bit samples are supported.
    handled_sampwidths = [2]
    with wave.open(args.input,"rb") as wavin:
        params = wavin.getparams()
        if params.sampwidth in handled_sampwidths:
            raw_data = wavin.readframes(params.nframes)
    if params.sampwidth not in handled_sampwidths:
        print(f"Unable to handle a sample width of {params.sampwidth}")
        sys.exit(1)
end = time.time()
if args.time_it:
    print(f"complete (took {np.round(end-start,6)} seconds)")
# Fallback path only: turn the raw frame bytes into a (frames, channels)
# array and scale the threshold to the sample range.
if not using_scipy:
    if args.time_it:
        print(f"Unpacking data... ",end="",flush=True)
    start = time.time()
    Ts = 1.0/params.framerate  # sample period in seconds
    if params.sampwidth==2:  # 16-bit PCM
        format_ = 'h'
        data = np.frombuffer(raw_data,dtype=np.int16)
    else:
        # 24-bit samples are 3 bytes wide and cannot be viewed directly as
        # int32, so unsupported widths are rejected explicitly instead of
        # being decoded incorrectly.
        print(f"Unable to handle a sample width of {params.sampwidth}")
        sys.exit(1)
    data = data.reshape(-1,params.nchannels)  # reshape into channels
    if args.auto_threshold:
        silence_threshold = auto_threshold()
    else:
        scale_factor = (256**params.sampwidth)/2.  # scale to [-1:1)
        silence_threshold *= scale_factor
    data = 1.0*data  # convert to np.float64
    end = time.time()
    if args.time_it:
        print(f"complete (took {np.round(end-start,6)} seconds)")

# Longest silence to keep, expressed in samples.
silence_duration_samples = args.silence_dur / Ts
if args.verbose:
    print(f"Input File Duration = {np.round(data.shape[0]*Ts,6)}\n")

# Filled in by detect_silence_in_channels().
combined_channel_silences = None
def detect_silence_in_channels():
    """Mark every sample of the global ``data`` as silent or not.

    A sample value is silent when its magnitude is at most
    ``silence_threshold``. For multi-channel data, mode ``"any"`` marks a
    frame silent when at least one channel is silent; any other mode
    requires all channels to be silent.

    The result is stored in the global ``combined_channel_silences`` as a
    boolean array padded with one ``False`` at each end, so silences that
    touch the start or end of the file still have a detectable edge.
    """
    global combined_channel_silences
    # Compare against +/- threshold rather than calling np.abs(data):
    # for fixed-width signed integers np.abs(-32768) wraps back to -32768,
    # which made full-scale negative samples look silent.
    within_threshold = (data <= silence_threshold) & (data >= -silence_threshold)
    if len(data.shape) > 1:
        if args.mode == "any":
            silent = within_threshold.any(axis=1)
        else:
            silent = within_threshold.all(axis=1)
    else:
        silent = within_threshold
    combined_channel_silences = np.pad(silent, pad_width=1, mode='constant', constant_values=0)
def get_silence_locations():
    """Locate the silences that are long enough to be shortened.

    Reads the padded boolean mask in ``combined_channel_silences`` and finds
    every run of silence longer than ``silence_duration_samples``. Long
    silences separated by a non-silent gap of at most
    ``args.non_silence_dur`` seconds are merged into a single silence that
    also covers the gap. Optionally prints the result.

    Returns:
        A ``(start_locs, end_locs)`` pair of sample-index arrays; each
        silence spans ``start`` (inclusive) to ``end`` (exclusive).
    """
    global combined_channel_silences
    silent = combined_channel_silences
    # Rising and falling edges of the padded mask; because of the one-sample
    # padding the resulting indexes refer to the unpadded data.
    starts = silent[1:] & ~silent[0:-1]
    ends = ~silent[1:] & silent[0:-1]
    start_locs = np.nonzero(starts)[0]
    end_locs = np.nonzero(ends)[0]
    durations = end_locs - start_locs
    long_duration_indexes = np.nonzero(durations > silence_duration_samples)[0]

    if len(long_duration_indexes) > 1:
        non_silence_gaps = start_locs[long_duration_indexes[1:]] - end_locs[long_duration_indexes[:-1]]
        short_non_silence_gap_locs = np.nonzero(non_silence_gaps <= (args.non_silence_dur/Ts))[0]

        # Report the gaps first, while end_locs still holds original values.
        if args.verbose and args.show_silence:
            for loc in short_non_silence_gap_locs:
                ns_gap_start = end_locs[long_duration_indexes[loc]] * Ts
                ns_gap_end = start_locs[long_duration_indexes[loc+1]] * Ts
                ns_gap_dur = ns_gap_end - ns_gap_start
                print(f"Removing non-silence gap at {np.round(ns_gap_start,6)} seconds with duration {np.round(ns_gap_dur,6)} seconds")

        # Merge from the last gap backwards so that, in a chain of several
        # silences joined by short gaps, the final end position propagates
        # all the way to the first silence. Merging front-to-back copied a
        # not-yet-updated end and left the tail of the chain untrimmed.
        for loc in short_non_silence_gap_locs[::-1]:
            end_locs[long_duration_indexes[loc]] = end_locs[long_duration_indexes[loc+1]]
        long_duration_indexes = np.delete(long_duration_indexes, short_non_silence_gap_locs + 1)

    if args.show_silence:
        if len(long_duration_indexes)==0:
            if args.verbose: print("No periods of silence found")
        else:
            if args.verbose: print("Periods of silence shown below")
            fmt_str = "%-12s %-12s %-12s"
            print(fmt_str % ("start","end","duration"))
            for idx in long_duration_indexes:
                start = start_locs[idx]
                end = end_locs[idx]
                duration = end - start
                print(fmt_str % (np.round(start*Ts,6),np.round(end*Ts,6),np.round(duration*Ts,6)))
        if args.verbose: print("")

    return start_locs[long_duration_indexes], end_locs[long_duration_indexes]
def trim_data(start_locs,end_locs):
    """Shorten each listed silence in the global ``data``.

    Half of the allowed silence (``silence_duration_samples``) is kept at
    the beginning of each silence and the remainder at its end; the samples
    in between are removed.

    Args:
        start_locs: Array of silence start indexes (inclusive).
        end_locs: Array of silence end indexes (exclusive).
    """
    global data
    if len(start_locs)==0:
        return
    head_keep = int(silence_duration_samples / 2)
    tail_keep = int(silence_duration_samples - head_keep)
    cut_from = start_locs + head_keep
    cut_to = end_locs - tail_keep
    # One index range per silence, joined into a single list of rows to drop.
    doomed_rows = np.concatenate([np.arange(lo, hi) for lo, hi in zip(cut_from, cut_to)])
    data = np.delete(data, doomed_rows, axis=0)
def output_data(start_locs,end_locs):
    """Write the global ``data`` to ``args.output``.

    Uses scipy when it was importable, otherwise the standard-library
    ``wave`` module with the parameters of the input file.

    Args:
        start_locs: Unused.
        end_locs: Unused.
    """
    global data
    if args.verbose:
        print(f"Output File Duration = {np.round(data.shape[0]*Ts,6)}\n")
    if args.time_it:
        print(f"Writing out data to {args.output}... ",end="",flush=True)

    if not using_scipy:
        # Convert back to the input sample format before opening the file.
        frame_bytes = data.astype(format_).tobytes()
        with wave.open(args.output,"wb") as wavout:
            wavout.setparams(params)  # same params as input
            wavout.writeframes(frame_bytes)
        return

    wavfile.write(args.output, sample_rate, data)
# Main flow: detect silence, then (unless --detect-only) trim and write.
start = time.time()
if args.time_it and not args.verbose:
    print("Detecting silence... ",end="",flush=True)
detect_silence_in_channels()
start_locs, end_locs = get_silence_locations()
end = time.time()
if args.time_it and not args.verbose:
    print(f"complete (took {np.round(end-start,6)} seconds)")

if not args.detect_only:
    if args.time_it:
        print("Trimming data... ",end="",flush=True)
    start = time.time()
    trim_data(start_locs,end_locs)
    end = time.time()
    if args.time_it:
        print(f"complete (took {np.round(end-start,6)} seconds)")

    start = time.time()
    output_data(start_locs, end_locs)
    end = time.time()
    if args.time_it:
        print(f"complete (took {np.round(end-start,6)} seconds)")
elif args.verbose:
    print("Not trimming, because 'detect only' flag was set")
如果您想要一个假设输入为 16 位 PCM、并去掉了所有额外打印语句及其他附加功能的精简脚本:
import numpy as np
from scipy.io import wavfile

# Params
(infile,outfile,threshold_db,silence_dur,non_silence_dur,mode) = ("test_stereo.wav","result.wav",-25,0.5,0.1,"all")
# Convert from dB to linear units and scale, assuming 16-bit PCM input
silence_threshold = np.round(10**(threshold_db/20.),6) * 32768


def _find_long_silences(data, fs, silence_threshold, silence_dur, non_silence_dur, mode):
    """Return (start, end) index arrays of silences longer than silence_dur."""
    # Compare against +/- threshold instead of np.abs(): for int16 data
    # np.abs(-32768) wraps to -32768 and would be classed as silence.
    within_threshold = (data <= silence_threshold) & (data >= -silence_threshold)
    if mode == "any":
        silent = within_threshold.any(axis=1)
    else:
        silent = within_threshold.all(axis=1)
    # Pad so silences touching either end of the file still have an edge.
    silent = np.pad(silent, pad_width=1, mode='constant', constant_values=0)

    # Get start and stop locations
    starts = silent[1:] & ~silent[0:-1]
    ends = ~silent[1:] & silent[0:-1]
    start_locs = np.nonzero(starts)[0]
    end_locs = np.nonzero(ends)[0]
    long_duration_indexes = np.nonzero((end_locs - start_locs) > silence_dur * fs)[0]

    # Cut out short non-silence between silence
    if len(long_duration_indexes) > 1:
        non_silence_gaps = start_locs[long_duration_indexes[1:]] - end_locs[long_duration_indexes[:-1]]
        short_gap_locs = np.nonzero(non_silence_gaps <= (non_silence_dur * fs))[0]
        # Merge back-to-front so the end of a chain of silences propagates
        # to the first silence of that chain.
        for loc in short_gap_locs[::-1]:
            end_locs[long_duration_indexes[loc]] = end_locs[long_duration_indexes[loc+1]]
        long_duration_indexes = np.delete(long_duration_indexes, short_gap_locs + 1)

    return start_locs[long_duration_indexes], end_locs[long_duration_indexes]


def shorten_silences(data, fs, silence_threshold, silence_dur=0.5, non_silence_dur=0.1, mode="all"):
    """Shorten every long silence in ``data`` to ``silence_dur`` seconds.

    Args:
        data: Sample array, 1-D (mono) or 2-D shaped (frames, channels).
        fs: Sample rate in Hz.
        silence_threshold: Largest sample magnitude still counted as silent,
            in the units of ``data``.
        silence_dur: Maximum silence duration, in seconds, kept in the output.
        non_silence_dur: Non-silent gaps of at most this many seconds between
            two long silences are removed together with the silence.
        mode: ``"any"`` if one silent channel makes a frame silent; otherwise
            all channels must be silent.

    Returns:
        The trimmed samples as a 2-D array shaped (frames, channels).
    """
    if len(data.shape)==1:
        data = np.expand_dims(data,axis=1)

    start_locs, end_locs = _find_long_silences(
        data, fs, silence_threshold, silence_dur, non_silence_dur, mode
    )
    # Trim whenever at least one long silence exists (a single long silence
    # used to be skipped).
    if len(start_locs) == 0:
        return data

    silence_duration_samples = silence_dur * fs
    keep_at_start = int(silence_duration_samples / 2)
    keep_at_end = int(silence_duration_samples - keep_at_start)
    delete_locs = np.concatenate([
        np.arange(lo, hi)
        for lo, hi in zip(start_locs + keep_at_start, end_locs - keep_at_end)
    ])
    return np.delete(data, delete_locs, axis=0)


if __name__ == "__main__":
    # Read data
    Fs, data = wavfile.read(infile)
    # Find and trim silence
    data = shorten_silences(data, Fs, silence_threshold, silence_dur, non_silence_dur, mode)
    # Output data
    wavfile.write(outfile, Fs, data)