Question

我想比较并识别两个声音流。我写了自己的算法，但它的效果并不像我预期的那样。我试着把字母 "A，B，C" 与 "D，E，F" 进行比较，或者把单词 "facebook" 与 "音乐" 进行比较，算法都给出了"匹配"（true）的结果，但它们显然不是相同的词。是我的算法太不精确，还是因为用笔记本电脑麦克风录制的声音质量太差？

我的比较算法的思路是：从一个流中取出100个样本（可以取自音轨的中间），然后在第二个流中以滑动窗口的方式逐一比较：先是第0–99个样本，然后是第1–100个、第2–101个，依此类推。我的程序里有若干条音轨需要与一条输入音轨进行比较，算法应当从每条音轨中找出最佳结果（即该音轨中最相似的一段样本）。不幸的是，它得到的结果是错误的。

using System;
using System.Collections.Generic;
using System.Collections.ObjectModel;
using System.ComponentModel;
using System.IO;
using System.Runtime.CompilerServices;
using System.Windows;
using Controller.Annotations;
using NAudio.Wave;


namespace Controller.Models
{
    /// <summary>
    /// One reference recording: the path of the WAV file, its decoded bytes and
    /// the similarity score assigned to it by the most recent comparison.
    /// </summary>
    class DecompositionOfSound
    {
        /// <summary>Path of the WAV file this instance was loaded from.</summary>
        public string Stream { get; set; }

        /// <summary>
        /// Number of similar samples found by the last comparison against this
        /// track. The Polish name is kept because callers depend on it.
        /// </summary>
        public int IloscPodobnychProbek { get; set; }

        /// <summary>
        /// Bytes read from the file through <c>WaveChannel32</c>.
        /// NOTE(review): WaveChannel32 presumably yields 32-bit float data, so
        /// these would be raw bytes of float samples rather than amplitudes —
        /// confirm against the NAudio documentation.
        /// </summary>
        public byte[] Samples { get; set; }

        /// <summary>
        /// Remembers the file path and immediately loads the file into
        /// <see cref="Samples"/>.
        /// </summary>
        /// <param name="stream">Path of the WAV file to load.</param>
        public DecompositionOfSound(string stream)
        {
            Stream = stream;
            SaveSamples(stream);
        }

        /// <summary>
        /// Reads the whole file into <see cref="Samples"/>.
        /// </summary>
        /// <param name="stream">Path of the WAV file to read.</param>
        private void SaveSamples(string stream)
        {
            // Both readers hold the underlying file open; dispose them as soon
            // as the bytes have been copied so the file handle is released.
            using (var reader = new WaveFileReader(stream))
            using (var wave = new WaveChannel32(reader))
            {
                var buffer = new byte[wave.Length];
                int total = 0;

                // A single Read call is not guaranteed to fill the buffer, so
                // keep reading until it is full or the stream reports the end.
                while (total < buffer.Length)
                {
                    int read = wave.Read(buffer, total, buffer.Length - total);
                    if (read <= 0)
                    {
                        break;
                    }
                    total += read;
                }

                Samples = buffer;
            }
        }
    }

/// <summary>
/// Keeps the list of reference recordings found in the "Samples" directory and
/// scores each of them against an input recording with a byte-wise
/// sliding-window comparison.
/// </summary>
class Sample: INotifyPropertyChanged
{
    #region Fields
    private IList<DecompositionOfSound> _soundSamples = new ObservableCollection<DecompositionOfSound>();
    private string[] _filePaths;
    #endregion

    #region Properties
    /// <summary>Paths of the WAV files found by the last <see cref="LoadSamples"/> call.</summary>
    public string[] FilePaths
    {
        get { return _filePaths; }
        set { _filePaths = value; }
    }

    /// <summary>
    /// The loaded reference recordings (Polish name kept because callers
    /// depend on it).
    /// </summary>
    public IList<DecompositionOfSound> ListaSciezekDzwiekowych
    {
        get { return _soundSamples; }
        set { _soundSamples = value; }
    }
    #endregion

    #region Methods
    /// <summary>Creates the object and loads every reference recording.</summary>
    public Sample()
    {
        LoadSamples(); // must be refreshed after every new recording
    }

    /// <summary>
    /// Shows one message box per reference recording containing its current
    /// similarity score.
    /// </summary>
    public void DisplayMatchingOfSamples()
    {
        foreach (var track in ListaSciezekDzwiekowych)
        {
            MessageBox.Show(track.IloscPodobnychProbek.ToString());
        }
    }

    /// <summary>
    /// Returns the reference recording with the highest similarity score.
    /// </summary>
    /// <returns>
    /// The best-scoring recording, or <c>null</c> when the list is empty or no
    /// recording has a score greater than zero. On a tie the first one wins.
    /// </returns>
    public DecompositionOfSound BestMatchingOfSamples()
    {
        DecompositionOfSound best = null;
        int bestScore = 0;
        foreach (var track in _soundSamples)
        {
            if (track.IloscPodobnychProbek > bestScore)
            {
                bestScore = track.IloscPodobnychProbek;
                best = track;
            }
        }
        return best;
    }

    /// <summary>
    /// (Re)loads every "*.wav" file from the "Samples" directory.
    /// </summary>
    public void LoadSamples()
    {
        string[] paths = Directory.GetFiles(@"Samples", "*.wav");

        // Load everything first so a failure while reading one file does not
        // leave the list half-replaced.
        var loaded = new List<DecompositionOfSound>(paths.Length);
        foreach (string path in paths)
        {
            loaded.Add(new DecompositionOfSound(path));
        }

        _filePaths = paths;

        // Replace the previous content instead of appending to it; otherwise
        // every reload would add each recording a second time.
        _soundSamples.Clear();
        foreach (var track in loaded)
        {
            _soundSamples.Add(track);
        }
    }

    /// <summary>
    /// Scores every reference recording against the given window of bytes.
    /// The score is stored in each recording's <c>IloscPodobnychProbek</c>.
    /// </summary>
    /// <param name="inputSound">Window of bytes to look for.</param>
    /// <param name="eps">Tolerance factor, see <see cref="CompareBufforsOfSamples"/>.</param>
    /// <exception cref="ArgumentNullException"><paramref name="inputSound"/> is null.</exception>
    public void CheckMatchingOfWord(byte[] inputSound,double eps)
    {
        if (inputSound == null)
        {
            throw new ArgumentNullException("inputSound");
        }

        foreach (var track in _soundSamples)
        {
            CompareBufforsOfSamples(inputSound, track, eps);
        }
    }

    /// <summary>
    /// Reads the given WAV file, takes <paramref name="iloscProbek"/> bytes
    /// starting at the middle of the file and scores every reference
    /// recording against that window.
    /// </summary>
    /// <param name="inputSound">Path of the WAV file to compare.</param>
    /// <param name="iloscProbek">Number of bytes in the comparison window.</param>
    /// <param name="eps">Tolerance factor, see <see cref="CompareBufforsOfSamples"/>.</param>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="iloscProbek"/> is negative or the window would run past
    /// the end of the file.
    /// </exception>
    public void CheckMatchingOfWord(String inputSound,int iloscProbek, double eps)
    {
        byte[] window;

        // Dispose the readers once the bytes are copied; they keep the file open.
        using (var reader = new WaveFileReader(inputSound))
        using (var wave = new WaveChannel32(reader))
        {
            var samples = new byte[wave.Length];
            wave.Read(samples, 0, samples.Length);

            // The window starts at the midpoint of the recording.
            int start = samples.Length >> 1;
            if (iloscProbek < 0 || iloscProbek > samples.Length - start)
            {
                throw new ArgumentOutOfRangeException(
                    "iloscProbek",
                    "The window must fit between the middle and the end of the input recording.");
            }

            window = new byte[iloscProbek];
            Array.Copy(samples, start, window, 0, iloscProbek);
        }

        CheckMatchingOfWord(window, eps);
    }

    /// <summary>
    /// Slides <paramref name="inputSound"/> over the reference recording and
    /// stores the best per-position hit count in
    /// <c>samples.IloscPodobnychProbek</c>.
    /// A byte <c>s</c> of the reference counts as a hit for input byte
    /// <c>x</c> when <c>x * eps &lt;= s &lt;= x + x * (1 - eps)</c>.
    /// </summary>
    /// <param name="inputSound">Window of bytes to look for.</param>
    /// <param name="samples">Reference recording that receives the score.</param>
    /// <param name="eps">Tolerance factor used in the range check above.</param>
    private void CompareBufforsOfSamples(byte[] inputSound, DecompositionOfSound samples, double eps)
    {
        byte[] track = samples.Samples;

        // Last offset at which the whole window still fits. The loop bound is
        // inclusive so that the final position (and a reference of exactly the
        // window's length) is compared as well.
        int lastStart = track.Length - inputSound.Length;
        int best = 0;

        for (int start = 0; start <= lastStart; start++)
        {
            int hits = 0;
            for (int j = 0; j < inputSound.Length; j++)
            {
                byte expected = inputSound[j];
                byte actual = track[start + j];
                if (expected * eps <= actual &&
                    (expected + expected * (1 - eps)) >= actual)
                {
                    hits++;
                }
            }

            if (hits > best)
            {
                best = hits;
                if (best == inputSound.Length)
                {
                    break; // every byte matched; no later position can score higher
                }
            }
        }

        samples.IloscPodobnychProbek = best;
    }
    #endregion

    #region INotifyPropertyChanged
    public event PropertyChangedEventHandler PropertyChanged;

    /// <summary>Raises <see cref="PropertyChanged"/> for the given property.</summary>
    /// <param name="propertyName">
    /// Name of the changed property; filled in by the compiler with the
    /// caller's member name when omitted.
    /// </param>
    [NotifyPropertyChangedInvocator]
    protected virtual void OnPropertyChanged([CallerMemberName] string propertyName = null)
    {
        // Copy to a local so a concurrent unsubscribe cannot null the event
        // between the check and the call.
        PropertyChangedEventHandler handler = PropertyChanged;
        if (handler != null)
        {
            handler(this, new PropertyChangedEventArgs(propertyName));
        }
    }
    #endregion
}

在对所有声音样本进行比较之后，算法会找出匹配样本数最多的音轨，但那并不是正确的录音。我这样比较两段录音是否有意义？应该如何修改才能得到预期的结果？能帮我找到这个问题的解决方案吗？抱歉我的英文不好。

亲切的问候

Answer 1

您根本无法对录音进行样本级别比较以确定您的匹配。即使在同一台计算机上同一个人所说同一个单词的两个录音 - 即：每个细节完全相同 - 记录的样本也会有所不同。数字音频就是这样。听起来可能相同，但实际录制的样本不匹配。

语音转文本并不简单，声纹识别（即：通过声音确认一个人的身份）也同样不简单。

您需要检查录音的频率曲线，而不是样本。自然语音中的各种声音具有不同的频率分布。咝音（sibilants，即 s 类的声音）具有较高的频率分布，因此很容易被检测到——这就是为什么旧式电话系统过去常常通过检测咝音来识别 yes / no 回答。

您可以通过对样本块使用快速傅里叶变换（FFT）来获取波形的频率分布。遍历音频流并做一系列FFT，得到波形频率随时间变化的二维图，然后在其中寻找有意义的特征，例如咝音（sibilants：高频成分很多，低频成分极少）。

当然，你可以使用一个基于网络的语音到文本API。

我的样本比较算法哪里做错了？

1 个答案: