Python音频处理librosa-平芜编程栈

"""
Python librosa 音频处理详解
包含：音频加载、梅尔频谱图、STFT、MFCC 特征、音高检测、音频可视化
"""
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt

def generate_test_tone(duration=3.0, sr=22050):
    """Generate a test signal made of two mixed sine waves.

    The signal is the sum of a 440 Hz sine (amplitude 0.5) and an 880 Hz
    sine (amplitude 0.3). A linear fade-in and fade-out, nominally 50 ms
    each, is applied to the ends to avoid audible clicks.

    Args:
        duration: Length of the signal in seconds.
        sr: Sampling rate in Hz.

    Returns:
        A ``(signal, sr)`` tuple: ``signal`` is a 1-D float array with
        ``int(sr * duration)`` samples, ``sr`` is the rate that was passed in.
    """
    n_samples = int(sr * duration)
    t = np.linspace(0, duration, n_samples, endpoint=False)

    # Mix of 440 Hz (A4) and 880 Hz (A5).
    signal = (0.5 * np.sin(2 * np.pi * 440 * t)
              + 0.3 * np.sin(2 * np.pi * 880 * t))

    # Clamp the fade to at most half of the signal so the two ramps never
    # exceed the available samples (very short or empty signals), and skip
    # it entirely when it rounds to zero samples: ``signal[-0:]`` would
    # select the whole array and fail to broadcast against an empty ramp.
    fade_len = min(int(sr * 0.05), n_samples // 2)
    if fade_len > 0:
        signal[:fade_len] *= np.linspace(0, 1, fade_len)
        signal[-fade_len:] *= np.linspace(1, 0, fade_len)
    return signal, sr

def plot_audio_features(y, sr, save_name='audio_features.png'):
    """Plot eight feature views of an audio signal in a 4x2 grid.

    The panels are: waveform, STFT spectrogram (dB), mel spectrogram (dB),
    MFCC, chroma, spectral centroid, zero-crossing rate and RMS energy.
    The figure is written to ``save_name`` at 150 dpi and then shown.

    Args:
        y: Audio samples, passed unchanged to the librosa feature functions.
        sr: Sampling rate in Hz.
        save_name: Output path for the saved figure.

    NOTE(review): titles and axis labels contain CJK characters; whether
    they render depends on the active matplotlib font -- confirm.
    """
    plt.figure(figsize=(14, 10))

    # Panel 1: raw waveform.
    plt.subplot(4, 2, 1)
    librosa.display.waveshow(y, sr=sr)
    plt.title('波形图 (Waveform)')
    plt.xlabel('时间 (秒)')
    plt.ylabel('振幅')

    # Panel 2: short-time Fourier transform (STFT) spectrogram.
    plt.subplot(4, 2, 2)
    stft_matrix = librosa.stft(y)  # complex-valued matrix
    # Convert magnitudes to decibels relative to the peak for readability.
    stft_db = librosa.amplitude_to_db(np.abs(stft_matrix), ref=np.max)
    librosa.display.specshow(stft_db, sr=sr, x_axis='time', y_axis='hz')
    plt.title('STFT 频谱图 (Spectrogram)')
    plt.colorbar(format='%+2.0f dB')

    # Panel 3: mel spectrogram (perceptually motivated frequency scale).
    plt.subplot(4, 2, 3)
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
    mel_db = librosa.power_to_db(mel_spec, ref=np.max)
    # No fmax override here: the spectrogram above is computed with the
    # library-default upper frequency, so the display must use the same
    # default or the mel axis ticks would not match the data.
    librosa.display.specshow(mel_db, sr=sr, x_axis='time', y_axis='mel')
    plt.title('梅尔频谱图 (Mel Spectrogram)')
    plt.colorbar(format='%+2.0f dB')

    # Panel 4: MFCC (mel-frequency cepstral coefficients).
    plt.subplot(4, 2, 4)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    librosa.display.specshow(mfcc, sr=sr, x_axis='time')
    plt.title('MFCC 特征 (13 维)')
    plt.colorbar()

    # Panel 5: chroma features (12 pitch classes).
    plt.subplot(4, 2, 5)
    chroma = librosa.feature.chroma_stft(y=y, sr=sr, n_chroma=12)
    librosa.display.specshow(chroma, sr=sr, x_axis='time', y_axis='chroma')
    plt.title('色度特征 (Chroma)')
    plt.colorbar()

    # Panel 6: spectral centroid (a brightness indicator), log-scaled y axis.
    plt.subplot(4, 2, 6)
    cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    times = librosa.times_like(cent, sr=sr)
    plt.semilogy(times, cent.T, label='谱质心', color='b')
    plt.ylabel('频率 (Hz)')
    plt.xlabel('时间 (秒)')
    plt.title('谱质心 (Spectral Centroid)')
    plt.grid(True)

    # Panel 7: zero-crossing rate.
    plt.subplot(4, 2, 7)
    zcr = librosa.feature.zero_crossing_rate(y)
    plt.plot(librosa.times_like(zcr, sr=sr), zcr.T, color='g')
    plt.title('零交叉率 (ZCR)')
    plt.xlabel('时间 (秒)')
    plt.ylabel('ZCR')
    plt.grid(True)

    # Panel 8: root-mean-square (RMS) energy.
    plt.subplot(4, 2, 8)
    rms = librosa.feature.rms(y=y)
    plt.plot(librosa.times_like(rms, sr=sr), rms.T, color='r')
    plt.title('RMS 能量 (RMS Energy)')
    plt.xlabel('时间 (秒)')
    plt.ylabel('RMS')
    plt.grid(True)

    plt.tight_layout()
    plt.savefig(save_name, dpi=150)
    plt.show()

# ========== 1. Load the audio file ==========
# Try to load an external audio file; if loading fails for any reason,
# fall back to the generated test tone so the demo still runs.
audio_path = 'input_audio.wav'
try:
    y, sr = librosa.load(audio_path, sr=None)
except Exception:
    # Deliberate best-effort fallback: any load failure lands here, not
    # only a missing file.
    print(f"文件 {audio_path} 不存在，使用生成的测试音调。")
    y, sr = generate_test_tone(duration=3.0)
    print(f"生成测试音频: 采样率={sr}Hz, 时长={len(y)/sr:.2f}s")
else:
    print(f"加载音频: {audio_path}, 采样率={sr}Hz, 时长={len(y)/sr:.2f}s")

# ========== 2. Time-domain summary ==========
print(f"音频基本信息: 采样点数={len(y)}, 采样率={sr}Hz")
print(f"振幅范围: [{y.min():.4f}, {y.max():.4f}]")

# ========== 3. Frequency-domain analysis: STFT ==========
# STFT parameters: FFT size 2048, hop length 512, window length 2048
# (window type left at the library default).
D = librosa.stft(y, n_fft=2048, hop_length=512, win_length=2048)
magnitude = np.abs(D)  # magnitude spectrum
phase = np.angle(D)  # phase spectrum
print(f"STFT 矩阵形状: {D.shape} (频率 bins x 时间帧数)")

# ========== 4. Mel spectrogram ==========
mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
mel_db = librosa.power_to_db(mel, ref=np.max)
print(f"梅尔频谱形状: {mel.shape}")

# ========== 5. MFCC feature extraction ==========
# 13 MFCCs plus their first- and second-order deltas.
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, n_fft=2048, hop_length=512)
mfcc_delta = librosa.feature.delta(mfcc)  # first-order difference
mfcc_delta2 = librosa.feature.delta(mfcc, order=2)  # second-order difference
print(f"MFCC 形状: {mfcc.shape}")
print(f"MFCC 均值: {mfcc.mean(axis=1)}")

# ========== 6. Pitch detection (fundamental-frequency estimation) ==========
# Estimate F0 with the pYIN (probabilistic YIN) algorithm.
f0, voiced_flag, voiced_probs = librosa.pyin(
    y,
    fmin=librosa.note_to_hz('C2'),  # lower bound, about 65 Hz
    fmax=librosa.note_to_hz('C7'),  # upper bound, about 2093 Hz
    sr=sr,
)
# Replace NaN (unvoiced frames) with 0 so they can be filtered out below.
f0 = np.nan_to_num(f0)
valid_pitches = f0[f0 > 0]
if len(valid_pitches) > 0:
    print(f"检测到音高范围: {valid_pitches.min():.1f}Hz ~ "
          f"{valid_pitches.max():.1f}Hz")
    # Convert frequencies to (fractional) MIDI note numbers.
    midi_notes = librosa.hz_to_midi(valid_pitches)
    # Round to the nearest semitone; truncating with int() would name a
    # slightly flat pitch (e.g. MIDI 68.99) one semitone too low.
    note_names = [librosa.midi_to_note(int(np.round(n)))
                  for n in midi_notes[:10]]
    print(f"前 10 个有效音高的音符: {note_names}")

# ========== 7. Beat tracking ==========
tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
# Depending on the librosa version, tempo may be a NumPy array rather than
# a Python float, and arrays reject the ".1f" format spec; coerce first.
tempo_bpm = float(np.atleast_1d(tempo)[0])
print(f"估计速度: {tempo_bpm:.1f} BPM")
if len(beats) > 0:
    beat_times = librosa.frames_to_time(beats, sr=sr)
    print(f"检测到 {len(beats)} 个节拍位置")

# ========== 8. Plot all audio features ==========
plot_audio_features(y, sr)
print("librosa 音频处理演示完成，涵盖时域/频域/MFCC/音高/节拍分析。")

Python音频处理librosa

到底为什么要有操作系统进程模型？

用Pandas rolling处理股票数据：从计算5日线到构建简易交易信号（附完整代码）

CaaS编码即服务：重塑开发模式，从零到一的高效架构实践

CORB-Planner：高速无人机避障轨迹规划技术解析

别再死记硬背CRF公式了！用TensorFlow 2.x手写一个命名实体识别(NER)层，从代码反推原理

别再只会拖拽了！Zotero文献管理的5个隐藏操作技巧（Shift/Ctrl键妙用）

到底为什么要有操作系统进程模型？

用Pandas rolling处理股票数据：从计算5日线到构建简易交易信号（附完整代码）

CaaS编码即服务：重塑开发模式，从零到一的高效架构实践

CORB-Planner：高速无人机避障轨迹规划技术解析

别再死记硬背CRF公式了！用TensorFlow 2.x手写一个命名实体识别(NER)层，从代码反推原理

别再只会拖拽了！Zotero文献管理的5个隐藏操作技巧（Shift/Ctrl键妙用）

到底为什么要有操作系统进程模型？