diff --git a/cooking/python/features/__init__.py b/cooking/python/features/__init__.py deleted file mode 100644 index 9b5ed21..0000000 --- a/cooking/python/features/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .base import * diff --git a/cooking/python/features/__init__.pyc b/cooking/python/features/__init__.pyc deleted file mode 100644 index b773fc6..0000000 Binary files a/cooking/python/features/__init__.pyc and /dev/null differ diff --git a/cooking/python/features/base.py b/cooking/python/features/base.py deleted file mode 100644 index c3d7f24..0000000 --- a/cooking/python/features/base.py +++ /dev/null @@ -1,173 +0,0 @@ -# calculate filterbank features. Provides e.g. fbank and mfcc features for use in ASR applications -# Author: James Lyons 2012 -import numpy -from features import sigproc -from scipy.fftpack import dct - -# make it python3.x compatible -try: - xrange(1) -except: - xrange=range - -def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13, - nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True): - """Compute MFCC features from an audio signal. - - :param signal: the audio signal from which to compute features. Should be an N*1 array - :param samplerate: the samplerate of the signal we are working with. - :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) - :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) - :param numcep: the number of cepstrum to return, default 13 - :param nfilt: the number of filters in the filterbank, default 26. - :param nfft: the FFT size. Default is 512. - :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. - :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 - :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. - :param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22. - :param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy. - :returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector. - """ - feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph) - feat = numpy.log(feat) - feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep] - feat = lifter(feat,ceplifter) - if appendEnergy: feat[:,0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy - return feat - -def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01, - nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97): - """Compute Mel-filterbank energy features from an audio signal. - - :param signal: the audio signal from which to compute features. Should be an N*1 array - :param samplerate: the samplerate of the signal we are working with. - :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) - :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) - :param nfilt: the number of filters in the filterbank, default 26. - :param nfft: the FFT size. Default is 512. - :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. - :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 - :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. - :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The - second return value is the energy in each frame (total energy, unwindowed) - """ - highfreq= highfreq or samplerate/2 - signal = sigproc.preemphasis(signal,preemph) - frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate) - pspec = sigproc.powspec(frames,nfft) - energy = numpy.sum(pspec,1) # this stores the total energy in each frame - energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log - - fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq) - feat = numpy.dot(pspec,fb.T) # compute the filterbank energies - feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log - - return feat,energy - -def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01, - nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97): - """Compute log Mel-filterbank energy features from an audio signal. - - :param signal: the audio signal from which to compute features. Should be an N*1 array - :param samplerate: the samplerate of the signal we are working with. - :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) - :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) - :param nfilt: the number of filters in the filterbank, default 26. - :param nfft: the FFT size. Default is 512. - :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. - :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 - :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. - :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. - """ - feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph) - return numpy.log(feat) - -def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01, - nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97): - """Compute Spectral Subband Centroid features from an audio signal. - - :param signal: the audio signal from which to compute features. Should be an N*1 array - :param samplerate: the samplerate of the signal we are working with. - :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) - :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) - :param nfilt: the number of filters in the filterbank, default 26. - :param nfft: the FFT size. Default is 512. - :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. - :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 - :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. - :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. - """ - highfreq= highfreq or samplerate/2 - signal = sigproc.preemphasis(signal,preemph) - frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate) - pspec = sigproc.powspec(frames,nfft) - pspec = numpy.where(pspec == 0,numpy.finfo(float).eps,pspec) # if things are all zeros we get problems - - fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq) - feat = numpy.dot(pspec,fb.T) # compute the filterbank energies - R = numpy.tile(numpy.linspace(1,samplerate/2,numpy.size(pspec,1)),(numpy.size(pspec,0),1)) - - return numpy.dot(pspec*R,fb.T) / feat - -def hz2mel(hz): - """Convert a value in Hertz to Mels - - :param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise. - :returns: a value in Mels. If an array was passed in, an identical sized array is returned. - """ - return 2595 * numpy.log10(1+hz/700.0) - -def mel2hz(mel): - """Convert a value in Mels to Hertz - - :param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise. - :returns: a value in Hertz. If an array was passed in, an identical sized array is returned. - """ - return 700*(10**(mel/2595.0)-1) - -def get_filterbanks(nfilt=20,nfft=512,samplerate=16000,lowfreq=0,highfreq=None): - """Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond - to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1) - - :param nfilt: the number of filters in the filterbank, default 20. - :param nfft: the FFT size. Default is 512. - :param samplerate: the samplerate of the signal we are working with. Affects mel spacing. - :param lowfreq: lowest band edge of mel filters, default 0 Hz - :param highfreq: highest band edge of mel filters, default samplerate/2 - :returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter. - """ - highfreq= highfreq or samplerate/2 - assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2" - - # compute points evenly spaced in mels - lowmel = hz2mel(lowfreq) - highmel = hz2mel(highfreq) - melpoints = numpy.linspace(lowmel,highmel,nfilt+2) - # our points are in Hz, but we use fft bins, so we have to convert - # from Hz to fft bin number - bin = numpy.floor((nfft+1)*mel2hz(melpoints)/samplerate) - - fbank = numpy.zeros([nfilt,nfft/2+1]) - for j in xrange(0,nfilt): - for i in xrange(int(bin[j]),int(bin[j+1])): - fbank[j,i] = (i - bin[j])/(bin[j+1]-bin[j]) - for i in xrange(int(bin[j+1]),int(bin[j+2])): - fbank[j,i] = (bin[j+2]-i)/(bin[j+2]-bin[j+1]) - return fbank - -def lifter(cepstra,L=22): - """Apply a cepstral lifter the the matrix of cepstra. This has the effect of increasing the - magnitude of the high frequency DCT coeffs. - - :param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size. - :param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter. - """ - if L > 0: - nframes,ncoeff = numpy.shape(cepstra) - n = numpy.arange(ncoeff) - lift = 1+ (L/2)*numpy.sin(numpy.pi*n/L) - return lift*cepstra - else: - # values of L <= 0, do nothing - return cepstra - diff --git a/cooking/python/features/base.pyc b/cooking/python/features/base.pyc deleted file mode 100644 index fe94496..0000000 Binary files a/cooking/python/features/base.pyc and /dev/null differ diff --git a/cooking/python/features/sigproc.py b/cooking/python/features/sigproc.py deleted file mode 100644 index ea16ad0..0000000 --- a/cooking/python/features/sigproc.py +++ /dev/null @@ -1,113 +0,0 @@ -# This file includes routines for basic signal processing including framing and computing power spectra. -# Author: James Lyons 2012 - -import numpy -import math - -def framesig(sig,frame_len,frame_step,winfunc=lambda x:numpy.ones((1,x))): - """Frame a signal into overlapping frames. - - :param sig: the audio signal to frame. - :param frame_len: length of each frame measured in samples. - :param frame_step: number of samples after the start of the previous frame that the next frame should begin. - :param winfunc: the analysis window to apply to each frame. By default no window is applied. - :returns: an array of frames. Size is NUMFRAMES by frame_len. - """ - slen = len(sig) - frame_len = int(round(frame_len)) - frame_step = int(round(frame_step)) - if slen <= frame_len: - numframes = 1 - else: - numframes = 1 + int(math.ceil((1.0*slen - frame_len)/frame_step)) - - padlen = int((numframes-1)*frame_step + frame_len) - - zeros = numpy.zeros((padlen - slen,)) - padsignal = numpy.concatenate((sig,zeros)) - - indices = numpy.tile(numpy.arange(0,frame_len),(numframes,1)) + numpy.tile(numpy.arange(0,numframes*frame_step,frame_step),(frame_len,1)).T - indices = numpy.array(indices,dtype=numpy.int32) - frames = padsignal[indices] - win = numpy.tile(winfunc(frame_len),(numframes,1)) - return frames*win - - -def deframesig(frames,siglen,frame_len,frame_step,winfunc=lambda x:numpy.ones((1,x))): - """Does overlap-add procedure to undo the action of framesig. - - :param frames: the array of frames. - :param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples. - :param frame_len: length of each frame measured in samples. - :param frame_step: number of samples after the start of the previous frame that the next frame should begin. - :param winfunc: the analysis window to apply to each frame. By default no window is applied. - :returns: a 1-D signal. - """ - frame_len = round(frame_len) - frame_step = round(frame_step) - numframes = numpy.shape(frames)[0] - assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len' - - indices = numpy.tile(numpy.arange(0,frame_len),(numframes,1)) + numpy.tile(numpy.arange(0,numframes*frame_step,frame_step),(frame_len,1)).T - indices = numpy.array(indices,dtype=numpy.int32) - padlen = (numframes-1)*frame_step + frame_len - - if siglen <= 0: siglen = padlen - - rec_signal = numpy.zeros((1,padlen)) - window_correction = numpy.zeros((1,padlen)) - win = winfunc(frame_len) - - for i in range(0,numframes): - window_correction[indices[i,:]] = window_correction[indices[i,:]] + win + 1e-15 #add a little bit so it is never zero - rec_signal[indices[i,:]] = rec_signal[indices[i,:]] + frames[i,:] - - rec_signal = rec_signal/window_correction - return rec_signal[0:siglen] - -def magspec(frames,NFFT): - """Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be NxNFFT. - - :param frames: the array of frames. Each row is a frame. - :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. - :returns: If frames is an NxD matrix, output will be NxNFFT. Each row will be the magnitude spectrum of the corresponding frame. - """ - complex_spec = numpy.fft.rfft(frames,NFFT) - return numpy.absolute(complex_spec) - -def powspec(frames,NFFT): - """Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be NxNFFT. - - :param frames: the array of frames. Each row is a frame. - :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. - :returns: If frames is an NxD matrix, output will be NxNFFT. Each row will be the power spectrum of the corresponding frame. - """ - return 1.0/NFFT * numpy.square(magspec(frames,NFFT)) - -def logpowspec(frames,NFFT,norm=1): - """Compute the log power spectrum of each frame in frames. If frames is an NxD matrix, output will be NxNFFT. - - :param frames: the array of frames. Each row is a frame. - :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. - :param norm: If norm=1, the log power spectrum is normalised so that the max value (across all frames) is 1. - :returns: If frames is an NxD matrix, output will be NxNFFT. Each row will be the log power spectrum of the corresponding frame. - """ - ps = powspec(frames,NFFT); - ps[ps<=1e-30] = 1e-30 - lps = 10*numpy.log10(ps) - if norm: - return lps - numpy.max(lps) - else: - return lps - -def preemphasis(signal,coeff=0.95): - """perform preemphasis on the input signal. - - :param signal: The signal to filter. - :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95. - :returns: the filtered signal. - """ - return numpy.append(signal[0],signal[1:]-coeff*signal[:-1]) - - - diff --git a/cooking/python/features/sigproc.pyc b/cooking/python/features/sigproc.pyc deleted file mode 100644 index fbf3dbc..0000000 Binary files a/cooking/python/features/sigproc.pyc and /dev/null differ