Source code for carat.features

# encoding: utf-8
# pylint: disable=C0103
# pylint: disable=too-many-arguments
"""
Features
========

Accentuation features
---------------------
.. autosummary::
    :toctree: generated/

    accentuation_feature
    feature_normalization
    feature_time_quantize

Feature maps
------------
.. autosummary::
    :toctree: generated/

    feature_map

Time-frequency
--------------
.. autosummary::
    :toctree: generated/

    spectrogram
    melSpectrogram

Miscellaneous
-------------
.. autosummary::
    :toctree: generated/

    generate_tatum_grid
    halfWaveRectification
    calculateDelta
    sumFeatures
"""

import numpy as np
import scipy as sp
from . import util

__all__ = ['accentuation_feature']

[docs]def accentuation_feature(signal, fs, sum_flag=True, log_flag=False, mel_flag=True, alpha=1000, maxfilt_flag=False, maxbins=3, **kwargs): """Compute accentuation feature from audio signal. Based on the log-power Mel spectrogram [1]. [1] Böck, Sebastian, and Gerhard Widmer. "Maximum filter vibrato suppression for onset detection." 16th International Conference on Digital Audio Effects, Maynooth, Ireland. 2013. This performs the following calculations to the input signal: input->STFT->(Mel scale)->(Log)->(Max filtering)->Diff->HWR->(Sum) Parenthesis denote optional steps. **Args**: - input: signal - fs: sampling rate - sum_flag (bool): true if the features are to be summed for each frame. - log_flag (bool): true if the features energy are to be converted to dB. - mel_flag (bool): true if the features are to be mapped in the Mel scale. - alpha (int): compression parameter for dB conversion - log10(alpha*abs(S)+1). - maxfilt_flag (bool): true if a maximum filtering is applied to the feature. - maxbins (int): number of frequency bins for maximum filter size - ``**kw`` : these keyword arguments are passed down to each of the functions used **Returns**: - feature (numpy array): feature values - time (numpy array): time values """ # STFT val_kw, remaining_kw = util.getValidKeywords(kwargs, spectrogram) feature, time, frequency = spectrogram(signal, fs, **val_kw) # mel scale mapping (and absolute value) if mel_flag: val_kw, remaining_kw = util.getValidKeywords(remaining_kw, melSpectrogram) feature, time, frequency = melSpectrogram(feature, time, frequency, **val_kw) else: # take the absolute value feature = sp.absolute(feature) # log magnitude (with compression parameter) if log_flag: feature = 20*sp.log10(alpha * feature + 1) # maximum filter (and difference) if maxfilt_flag: # maximum filter max_spec = sp.ndimage.filters.maximum_filter(feature, size=(maxbins, 1)) # init the diff array diff = sp.zeros(feature.shape) # calculate difference between log spec and max filtered version diff[:, 1:] = (feature[:, 1:] - max_spec[:, : -1]) # save feature data feature = diff else: # conventional difference (first order) feature = calculateDelta(feature, delta_filter_length=1) # half-wave rectification feature = halfWaveRectification(feature) # sum features if sum_flag: feature = sumFeatures(feature) # return return feature, time, frequency
[docs]def feature_map(feature, time, beats, downbeats, n_beats=4, n_tatums=4, norm_flag=True, pnorm=8, window=0.1): """Compute feature map from accentuation feature signal. Based on the feature map introduced in [1]. [1] Rocamora, Jure, Biscainho "Tools for detection and classification of piano drum patterns from candombe recordings." 9th Conference on Interdisciplinary Musicology (CIM), Berlin, Germany. 2014. The accentuation feature is organized into a feature map. First, the feature signal is time-quantized to the rhythm metric structure by considering a grid of tatum pulses equally distributed within the annotated beats. The corresponding feature value is taken as the maximum within window centered at the frame closest to each tatum instant. This yields feature vectors whose coordinates correspond to the tatum pulses of the rhythm cycle (or bar). Finally, a feature map of the cycle-length rhythmic patterns of the audio file is obtained by building a matrix whose columns are consecutive feature vectors. **Args**: - feature (numpy array): feature signal - ``**kw``: these keyword arguments are passed down to each of the functions used **Returns**: - : **Raises**: - """ normalized_feature = np.copy(feature) if norm_flag: # normalize feature values with a p-norm applied within a local window normalized_feature = feature_normalization(feature, time, beats, n_tatums=n_tatums, pnorm=pnorm) # generate tatum grid to time quantize feature values tatums = generate_tatum_grid(beats, downbeats, n_tatums) # time quantize the feature signal to the tatum grid quantized_feature = feature_time_quantize(normalized_feature, time, tatums, window=window) # reshape into a matrix whose columns are bars and its elements are tatums features_map = np.reshape(quantized_feature, (-1, n_beats*n_tatums)) return features_map, quantized_feature, tatums, normalized_feature
[docs]def feature_normalization(feature, time, beats, n_tatums=4, pnorm=8): """Local amplitude normalization of the feature signal. Based on the feature map introduced in [1] and detailed in [2]. [1] Rocamora, Jure, Biscainho "Tools for detection and classification of piano drum patterns from candombe recordings." 9th Conference on Interdisciplinary Musicology (CIM), Berlin, Germany. 2014. [2] Rocamora, Cancela, Biscainho "Information theory concepts applied to the analysis of rhythm in recorded music with recurrent rhythmic patterns." Journal of the AES, 67(4), 2019. A local amplitude normalization is carried out to preserve intensity variations of the rhythmic patterns while discarding long-term fluctuations in dynamics. A p-norm within a local window is applied. The window width is proportional to the beat period. **Args**: - feature (numpy array): feature signal values - time (numpy array): time instants of the feature values - beats (numpy array): time instants of the tactus beats - n_tatums (int): number of tatums per tactus beat - pnorm (int): p-norm order for normalization **Returns**: - : **Raises**: - norm_feature (numpy array): normalized feature signal values """ # estimate tatum period from annotations for normalization # time sample interval samps_time = time[1] - time[0] # compute tatum period in seconds from median of the tactus intervals beats_periods = beats[1:] - beats[0:-1] # tatum period in seconds tatum_period_secs = np.median(beats_periods) / n_tatums # period in samples tatum_period_samps = int(round(tatum_period_secs / samps_time)) - 1 # normalize feature norm_feature = normalize_features(feature, tatum_period_samps * pnorm, p=pnorm) return norm_feature
[docs]def generate_tatum_grid(beats, downbeats, n_tatums): """Generate tatum temporal grid from time instants of the tactus beats. A grid of tatum pulses is generated equally distributed within the given tactus beats. The grid is used to time quantize the feature signal to the rhythmic metric structure. Parameters ---------- labels_time (np.ndarray) : time instants of the tactus beats labels (list) : labels at the tactus beats (e.g. 1.1, 1.2, etc) Returns ------- tatum_time (np.ndarray) : time instants of the tatum beats """ # first and last downbeat first_downbeat = downbeats[0] last_downbeat = downbeats[-1] # index of first and last downbeat into the beats array # NOTE: we assume coincident beats and downbeats (because of our data) indx_first = int(np.where(beats == first_downbeat)[0]) indx_last = int(np.where(beats == last_downbeat)[0]) # number of tactus beats num_beats = indx_last - indx_first # the beat of the last downbeat is not counted # time instants of the tatums tatums = np.zeros(num_beats * n_tatums) # compute tatum time locations from the tactus beats for ind in range(indx_first, indx_last): # tatum period estimated from tactus beats tatum_period = (beats[ind+1] - beats[ind]) / n_tatums # a whole bar of tatum beats tatum_bar = np.array(range(n_tatums) * tatum_period + beats[ind]) # save bar of tatum beats tatums[(ind*n_tatums):((ind+1)*n_tatums)] = tatum_bar return tatums
[docs]def feature_time_quantize(feature, time, tatums, window=0.1): """Time quantization of the feature signal to a tatum grid. The feature signal is time-quantized to the rhythm metric structure by considering a grid of tatum pulses equally distributed within the tactus beats. The feature value assigned to each tatum time instant is obtained as the maximum value of the feature signal within a certain window centered at the tatum time instant. Default value for the total window lenght is 100 ms. **Args**: - feature (numpy array): feature signal values - time (numpy array): time instants of the feature values - tatums (numpy array): time instants of the tatum grid **Returns**: - : **Raises**: - quantized_feature (numpy array): time quantized feature signal values """ # number of tatum instants num_tatums = tatums.size # time quantized feature values at tatum instants quantized_feature = np.zeros(tatums.shape) # compute hop size to set time-quantization window hop_size = time[1] - time[0] # number of frames within time quantization window win_num_frames = int(window / hop_size) # half window in frames hwf = int(np.floor(win_num_frames / 2)) # check if it is even to center window if hwf % 2 != 0: hwf -= 1 # compute feature value considering a certain neighbourhood for ind in range(num_tatums): # closest feature frame to tatum instant ind_tatum = util.find_nearest(time, tatums[ind]) # get maximum feature value within a neighbourhood if ind_tatum == 0: quantized_feature[ind] = np.max(feature[ind_tatum:ind_tatum+hwf+1]) else: quantized_feature[ind] = np.max(feature[ind_tatum-hwf:ind_tatum+hwf+1]) return quantized_feature
[docs]def spectrogram(signal, fs, window_length=20e-3, hop=10e-3, windowing_function=sp.hanning, dft_length=None, zp_flag=False): """ Calculates the Short-Time Fourier Transform a signal. Given an input signal, it calculates the DFT of frames of the signal and stores them in bi-dimensional Scipy array. **Args**: - window_len (float):length of the window in seconds (must be positive). - window (callable): a callable object that receives the window length in samples and returns a numpy array containing the windowing function samples. - hop (float): frame hop between adjacent frames in seconds. - zp_flag (bool): a flag indicating if the *Zero-Phase Windowing* should be performed. **Returns**: - spec (numpy array): spectrogram data - time (numpy array): time in seconds of each frame - frequnecy (numpy array): frequency grid """ # Converting window_length and hop from seconds to number of samples: win_samps = int(round(window_length * fs)) hop_samps = max(int(round(hop * fs)), 1) spec, time, frequency = util.STFT(signal, win_samps, hop_samps, windowing_function, dft_length, zp_flag) # convert indices to seconds and Hz time /= fs frequency *= fs # return return spec, time, frequency
[docs]def melSpectrogram(in_spec, in_time, in_freq, nfilts=40, minfreq=20, maxfreq=None): """This function converts a Spectrogram with linearly spaced frequency components to the Mel scale. Given an input signal, it calculates the DFT of frames of the signal and stores them in bi-dimensional Scipy array. **Args**: - window_len (float): length of the window in seconds (must be positive). - window (callable): a callable object that receives the window length in samples and returns a numpy array containing the windowing function samples. **Returns**: - spec (numpy array): mel-spectrogram data - time (numpy array): time in seconds of each frame - frequnecy (numpy array): frequency grid """ if maxfreq is None: maxfreq = max(in_freq) (wts, frequency) = util.fft2mel(in_freq, nfilts, minfreq, maxfreq) spec = sp.dot(wts, sp.sqrt(sp.absolute(in_spec)**2)) time = in_time # return return spec, time, frequency
[docs]def halfWaveRectification(in_signal): """ Half-wave rectifies features. All feature values below zero are assigned to zero. **Args**: - input: feature object - delta_filter_length (int): length of the filter used to calculate the Delta coefficients. Must be an odd number. **Returns**: - output: numpy array **Raises**: - ValueError when the input features are complex. """ out_signal = np.copy(in_signal) if out_signal.dtype != complex: out_signal[out_signal < 0] = 0.0 else: raise ValueError('Cannot half-wave rectify a complex signal.') return out_signal
[docs]def calculateDelta(in_signal, delta_filter_length=3): """ This function calculates the delta coefficients of a given feature. **Args**: - input: input feature signal - delta_filter_length (int): length of the filter used to calculate the Delta coefficients. Must be an odd number. **Returns**: - output: output feature signal """ out_signal = np.copy(in_signal) out_signal = util.deltas(out_signal, delta_filter_length) return out_signal
[docs]def sumFeatures(in_signal): """ This function sums all features along frames. **Args**: - input: input feature signal **Returns**: - output: output feature signal """ out_signal = np.copy(in_signal) out_signal = sp.sum(out_signal, axis=0) return out_signal
def def_norm_feat_gen(data, max_period, p): """ Normalization of the feature signal using p-norm. """ if not max_period % 2: max_period += 1 ext_len = int((max_period - 1) / 2) ext_data = data[1:ext_len + 1][::-1] ext_data = sp.append(ext_data, data) ext_data = sp.append(ext_data, data[-2:-ext_len - 2:-1]) def aux(i, win_size): fac = int(win_size % 2) h_len = int(win_size / 2) aux = sp.linalg.norm(ext_data[i - h_len + ext_len : i + ext_len + h_len + fac], ord=p) return ext_data[i + ext_len] / max(aux, 1e-20) return aux def normalize_features(data, win_len, p): """ Normalization of the feature signal using p-norm. """ aux = def_norm_feat_gen(data, win_len, p) out = data.copy() for i in range(data.size): out[i] = aux(i, win_len) return out #def accentuation_feature(y, sr=22050, hop_length=512, n_fft=2048, # n_mels=128, freq_sb=None, **kwargs): # """Compute accentuation feature from audio signal. # # # Based on the log-power Mel spectrogram [1]. # # [1] Böck, Sebastian, and Gerhard Widmer. # "Maximum filter vibrato suppression for onset detection." # 16th International Conference on Digital Audio Effects, # Maynooth, Ireland. 2013. # # In current implementation win_length = n_fft (because of librosa) # The log-power Mel spectrogram is computed for the full spectrum, # i.e. up to sr/2 but fmin and fmax could be used to limit the representation. # # A frequency band to focus on. # # Parameters # ---------- # # Returns # ------- # # Examples # -------- # # Notes # ----- # """ # # if freq_sb is None: # # compute feature values for the full spectrum # feature_values = librosa.onset.onset_strength(y=y, sr=sr, n_fft=n_fft, n_mels=n_mels, # hop_length=hop_length, **kwargs) # else: # # check if two frequency values are provided # if isinstance(freq_sb, np.ndarray) and freq_sb.shape[0] == 2: # # # compute the frequency of the mel channels # n_freqs = librosa.core.time_frequency.mel_frequencies(n_mels=n_mels, fmin=0.0, # fmax=sr/2, htk=False) # # find indexes of sub-band channels # channels = [util.find_nearest(n_freqs, freq) for freq in freq_sb] # # compute feature values for the full spectrum and aggregate across a sub-band # feature_values = librosa.onset.onset_strength_multi(y=y, sr=sr, # n_fft=n_fft, # n_mels=n_mels, # hop_length=hop_length, # channels=channels, # **kwargs) # # save a single sub-band # feature_values = feature_values[0] # # # compute time instants of the feature values # times = librosa.frames_to_time(np.arange(feature_values.shape[0]), sr=sr, # n_fft=n_fft, hop_length=hop_length) # # remove offset of n_fft/2 and hop_length because of the time lag for computing differences # times = times - (n_fft/2/sr) - (hop_length/sr) # # times = (np.arange(feature_values.shape[0]) * hop_length + win_length/2) / sr # # return feature_values, times