如何使用Octave获取MFCC特征

4
我希望创建一个Octave程序,加载音频文件(wav,flac),计算其mfcc特征并将其作为输出提供。问题在于我没有太多的Octave经验,无法让Octave加载音频文件,因此我不确定提取算法是否正确。有没有简单的方法来加载文件并获取其特征?

你到底尝试了什么,哪些是不起作用的?请注意,Octave 4.0.0是最新版本,其主要功能之一是支持音频。 - carandraug
2个回答

3

您可以在Octave中运行RASTAMAT的mfcc代码,您只需要修复一些问题,修复版本可以从这里下载。

更改是为了正确设置powspec.m中的窗口。

  WINDOW = hanning(winpts);

并修复specgram函数中与Matlab不兼容的错误


1
很难给你建议,因为你没有提供任何关于错误的信息。 - Nikolay Shmyrev
我有一个包含网站上所有m文件和mp3文件a.mp3的目录。当我尝试运行网站上的示例命令时,出现了以下错误: error: 'mp3read' undefined near line 9 column 11 命令为: [d,sr] = mp3read('a.mp3',[1 30*22050],1,2); - nstanchev
3
@NikolayShmyrev,我知道这是一个老问题,但是提供的源代码链接已经失效了。你能否提供另一个链接或在其他地方发布代码? - jotadepicas
1
发现了另一个由Sunil Kopparapu博士编写的代码:https://sites.google.com/site/sunilkopparapu/Home/asks(在“在Octave中计算MFCC”视频中引用:https://www.youtube.com/watch?v=oTI6c87M3Gs) - jotadepicas
2
Dropbox链接失效了。404! - Indra
显示剩余9条评论

2
可以参考这个用于计算MFCC的Octave实现:https://github.com/jagdish7908/mfcc-octave 。有关计算MFCC的详细理论步骤,请参考:http://practicalcryptography.com/miscellaneous/machine-learning/guide-mel-frequency-cepstral-coefficients-mfccs/ 。下面按顺序给出各个函数以及调用它们的主脚本:
 function frame = create_frames(y, Fs, Fsize, Fstep)
   % CREATE_FRAMES  Slice a signal into fixed-length, regularly spaced frames.
   %
   %   frame = create_frames(y, Fs, Fsize, Fstep)
   %
   %   y     : signal samples (row or column vector; for a matrix only the
   %           first column is framed)
   %   Fs    : sampling rate in Hz
   %   Fsize : frame length in seconds
   %   Fstep : hop between the starts of consecutive frames, in seconds
   %
   %   Returns a floor(Fs*Fsize)-by-numFrames matrix with one frame per
   %   column, or [] when the signal is shorter than a single frame.
   %   Raises an error when the frame length or the hop is shorter than one
   %   sample (a zero hop would otherwise never advance through the signal).
   frame = [];
   if isempty(y)
     return
   endif

   samplesPerFrame = floor(Fs*Fsize);
   samplesPerFramestep = floor(Fs*Fstep);
   if samplesPerFrame < 1 || samplesPerFramestep < 1
     error('create_frames: frame length and frame step must each span at least one sample');
   endif

   % Work on a single column so that every frame becomes one matrix column
   % regardless of the orientation of the input.
   if ~isvector(y)
     y = y(:,1);
   endif
   y = y(:);
   N = numel(y);
   if N < samplesPerFrame
     return
   endif

   % The last admissible frame starts at N-samplesPerFrame+1, hence the +1.
   numFrames = floor((N - samplesPerFrame)/samplesPerFramestep) + 1;

   % Build all sample indices at once: rows walk through a frame, columns
   % walk through the frame start offsets.
   starts = (0:numFrames-1)*samplesPerFramestep;
   idx = (1:samplesPerFrame)' + starts;
   frame = reshape(y(idx), samplesPerFrame, numFrames);
 endfunction

function mel = hz2mel(f)
  % HZ2MEL  Convert frequency in Hz to the mel scale.
  %
  %   mel = hz2mel(f) evaluates 1125*log(1 + f/700) for every element of f,
  %   so scalars, vectors and matrices are all accepted.
  ratio = f/700;
  mel = 1125*log(1 + ratio);
endfunction

 function hz = mel2hz(f)
   % MEL2HZ  Convert a mel-scale value back to frequency in Hz.
   %
   %   hz = mel2hz(f) evaluates 700*(exp(f/1125) - 1) for every element of
   %   f, so scalars, vectors and matrices are all accepted.
   growth = exp(f/1125);
   hz = 700*(growth - 1);
 endfunction

function bank = melbank(n, min, max, sr)
  % MELBANK  Build a bank of triangular, mel-spaced filters.
  %
  %   bank = melbank(n, min, max, sr)
  %
  %   n   : number of triangular filters
  %   min : lower edge of the first filter, in Hz
  %   max : upper edge of the last filter, in Hz
  %   sr  : sampling rate, in Hz
  %
  %   Returns an NFFT-by-(n+1) matrix with NFFT = 512. Row r holds the
  %   weights for the zero-based frequency bin r-1, where bin b starts at
  %   b*(sr/2)/NFFT Hz, so row 1 is the DC bin. Filter j is stored in
  %   column j+1 and column 1 is always zero; that layout is kept on purpose
  %   so that existing callers, which discard the first column's energy,
  %   keep working.
  %
  %   NOTE: the parameter names `min` and `max` shadow the built-in
  %   functions inside this body, so the built-ins are not called here.
  NFFT = 512;
  nyquist = sr/2;

  % Zero-based bin range spanned by [min, max], clamped to the bins that
  % actually have a row in the result (0 .. NFFT-1).
  minBin = floor(NFFT*min/nyquist);
  maxBin = floor(NFFT*max/nyquist);
  if minBin < 0
    minBin = 0;
  endif
  if maxBin > NFFT-1
    maxBin = NFFT-1;
  endif

  % n filters need n+2 edge points, equally spaced on the mel scale.
  % linspace guarantees exactly n+2 points; a floating-point colon range
  % with a computed step does not.
  m = linspace(hz2mel(min), hz2mel(max), n+2);
  h = mel2hz(m);
  % replace the edge frequencies with their zero-based bin numbers
  fbin = floor(NFFT*h/nyquist);

  % create the triangular mel filter vectors
  H = zeros(NFFT, n+1);
  for vect = 2:n+1
    lowEdge = fbin(vect-1);
    peak = fbin(vect);
    highEdge = fbin(vect+1);
    for k = minBin:maxBin
      if k >= lowEdge && k <= peak
        if peak > lowEdge
          weight = (k-lowEdge)/(peak-lowEdge);
        else
          % Both edges fall in the same bin: the rising side has zero
          % width, so this bin is the peak itself (avoids 0/0 = NaN).
          weight = 1;
        endif
        % k is a zero-based bin number; rows are one-based.
        H(k+1,vect) = weight;
      elseif k >= peak && k <= highEdge
        % Reaching this branch implies k > peak, hence highEdge > peak and
        % the division is safe.
        H(k+1,vect) = (highEdge - k)/(highEdge-peak);
      endif
    endfor
  endfor
  bank = H;
endfunction

% Record (or load) audio, split it into frames and plot the mel-frequency
% cepstral coefficients of every frame.
clc;
% Clear variables only: `clear all` also removes functions from the symbol
% table, which would discard helper functions defined earlier in the same
% session or script.
clear -variables;
close all;
pkg load signal;   % provides periodogram and dct

% record audio
Fs = 44100;
y = record(3, Fs);
% OR %
% Load an existing file instead (audioread replaces wavread, which is no
% longer available in current Octave releases)
%[y, Fs] = audioread('../FILE_PATH/');
%y = y(Fs:2*Fs, 1);

% Frame a single channel, stored as a column.
if isvector(y)
  y = y(:);
else
  y = y(:,1);
endif

% create mel filterbanks
minFreq = 500;     % minimum cutoff frequency in Hz
maxFreq = 10000;   % maximum cutoff frequency in Hz
% melbank(number_of_banks, minFreq, maxFreq, sampling_rate)
foo = melbank(30, minFreq, maxFreq, Fs);
nBins = size(foo, 1);            % spectrum bins covered by the bank
numCoeffs = size(foo, 2) - 1;    % the first bank column is empty, see below

% create frames: 25 ms long, one every 10 ms
frames = create_frames(y, Fs, 0.025, 0.010);
NF = size(frames, 2);
if NF == 0
  error('mfcc: the signal is shorter than one 25 ms frame');
endif

% one row of coefficients per frame
PXX = zeros(NF, numCoeffs);
for i = 1:NF
  % calculate the periodogram of the frame
  P = periodogram(frames(:,i), [], 2*nBins, Fs);
  % apply the mel filters to the power spectrum and sum the energy in
  % each filter
  energy = sum(foo.*P(1:nBins));
  % discard the first entry: that bank column is all zeros, so its energy
  % is always 0
  energy = energy(2:end);
  % take the logarithm; flooring at realmin keeps a silent band finite
  % instead of turning the whole frame into -Inf/NaN after the DCT
  logEnergy = log(max(energy, realmin));
  % take the DCT of the log filterbank energies
  PXX(i,:) = dct(logEnergy);
endfor
% stack the coefficients column wise: one column per frame
PXX = PXX';
plot(PXX);

1
欢迎来到SO!不要发布指向网站的链接,因为这些链接可能会在未来失效或被删除。相反,请解释解决方案。 - Abhishek Dutt

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接