Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found
Select Git revision
  • main
  • TISMIR
  • WASPAA23
  • vPhD
4 results

Target

Select target project
  • a23marmo/autosimilarity_segmentation
1 result
Select Git revision
  • main
  • TISMIR
  • WASPAA23
  • vPhD
4 results
Show changes
Commits on Source (9)
......@@ -2,9 +2,9 @@ from . import autosimilarity_computation
from . import barwise_input
from . import data_manipulation
from . import CBM_algorithm
from . import foote_novelty
#from . import foote_novelty
from .model import current_plot
from .model import errors
from .model import features
from .model import signal_to_spectrogram
from .model import display_results
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 14 16:30:31 2022
@author: amarmore
Module used to compute autosimilarity matrices.
"""
import as_seg.model.errors as err
import numpy as np
import sklearn.metrics.pairwise as pairwise_distances
import warnings
def switch_autosimilarity(an_array, similarity_type, gamma = None, normalise = True):
    r"""
    High-level dispatcher computing the autosimilarity matrix of a barwise matrix.

    The input is expected to be of shape (Bars, Feature representation).
    The similarity function s_{x_i,x_j} between two bars x_i and x_j is
    selected with ``similarity_type`` (case-insensitive):

    - "cosine" for the cosine similarity, i.e. the normalised dot product:

        .. math::
            s_{x_i,x_j} = \frac{\langle x_i, x_j \rangle}{||x_i|| ||x_j||}

    - "covariance" for a covariance similarity,
      i.e. the dot product of centered features:

        .. math::
            s_{x_i,x_j} = \langle x_i - \hat{x}, x_j - \hat{x} \rangle

    - "rbf" for the Radial Basis Function similarity,
      i.e. the exponent of the opposite of the euclidean distance between features:

        .. math::
            s_{x_i,x_j} = \exp^{-\gamma ||x_i - x_j||_2}

      The euclidean distance can be the distance between the normalised features.
      See rbf_kernel from scikit-learn for more details.

    - "centered_rbf" for the RBF similarity computed on centered features.

    Parameters
    ----------
    an_array : numpy array
        The matrix whose autosimilarity is computed,
        expected to be of shape (Bars, Feature representation).
    similarity_type : string
        Either "cosine", "covariance", "rbf" or "centered_rbf".
    gamma : positive float, optional
        The gamma parameter of the RBF, only used by "rbf" and "centered_rbf".
        The default is None, meaning that it is computed from the standard deviation
        of the distances, see get_gamma_std() for more details.
    normalise : boolean, optional
        Whether features should be normalised or not.
        Normalisation depends on the similarity function.
        It is not forwarded to the cosine similarity, which always normalises.
        The default is True.

    Raises
    ------
    InvalidArgumentValueException
        If ``similarity_type`` is not one of the supported values.

    Returns
    -------
    numpy array
        Autosimilarity matrix of the input an_array.
    """
    similarity = similarity_type.lower()  # Lower-cased once instead of once per branch.
    if similarity == "cosine":
        # The cosine similarity is normalised by definition, hence `normalise` is not passed.
        return get_cosine_autosimilarity(an_array)
    if similarity == "covariance":
        return get_covariance_autosimilarity(an_array, normalise = normalise)
    if similarity == "rbf":
        return get_rbf_autosimilarity(an_array, gamma, normalise = normalise)
    if similarity == "centered_rbf":
        return get_centered_rbf_autosimilarity(an_array, gamma, normalise = normalise)
    raise err.InvalidArgumentValueException(f"Incorrect similarity type: {similarity_type}. Should be cosine, covariance, rbf or centered_rbf.")
def l2_normalise_barwise(an_array):
    """
    Normalises the array barwise (i.e., along its first dimension) by the l_2 norm.

    Each row (bar) is divided by its euclidean norm.
    Entries which are NaN after the division (typically null rows, divided by a zero norm)
    are replaced by the small positive value of 10^{-10}.

    Parameters
    ----------
    an_array : numpy array
        The array which needs to be normalised, rows being bars.

    Returns
    -------
    numpy array
        The normalised array, of the same shape as the input.
    """
    # np.errstate silences the "invalid value" floating-point warning raised by 0/0 on null rows.
    # It does not depend on the wording of the warning message, which changes between NumPy versions.
    with np.errstate(invalid = "ignore"):
        an_array_T = an_array.T/np.linalg.norm(an_array, axis = 1)
    # Replace the NaN produced by null rows, avoiding future errors in handling values.
    an_array_T = np.where(np.isnan(an_array_T), 1e-10, an_array_T)
    return an_array_T.T
def get_cosine_autosimilarity(an_array):
    r"""
    Computes the autosimilarity matrix with the cosine similarity function.

    The cosine similarity function is the normalised dot product between two bars, i.e.:

    .. math::
        s_{x_i,x_j} = \frac{\langle x_i, x_j \rangle}{||x_i|| ||x_j||}

    Parameters
    ----------
    an_array : numpy array or list
        The matrix whose autosimilarity is to compute.
        Expected to be of shape (Bars, Feature representation).
        A list is converted to a numpy array first.

    Returns
    -------
    numpy array
        The autosimilarity of this array, with the cosine similarity function.
    """
    this_array = np.array(an_array) if isinstance(an_array, list) else an_array
    # Rows are always l2-normalised here: the dot product of unit rows is the cosine.
    this_array = l2_normalise_barwise(this_array)
    return this_array@this_array.T
def get_covariance_autosimilarity(an_array, normalise = True):
    r"""
    Computes the autosimilarity matrix, where the similarity function is the covariance.

    The covariance similarity function corresponds to the dot product of centered features:

    .. math::
        s_{x_i,x_j} = \langle x_i - \hat{x}, x_j - \hat{x} \rangle

    where \hat{x} is the mean of the rows (bars) of the array.

    Parameters
    ----------
    an_array : numpy array or list
        The matrix whose autosimilarity will be computed.
        A list is converted to a numpy array first.
    normalise : boolean, optional
        Whether features should be normalised or not.
        Normalisation here means that each centered feature is normalised by its norm.
        The default is True.

    Returns
    -------
    numpy array
        The covariance autosimilarity of this array.
    """
    this_array = np.array(an_array) if isinstance(an_array, list) else an_array
    # Centering: the mean over the first axis (the average bar) is subtracted from every row.
    this_array = this_array - this_array.mean(axis=0)
    if normalise:
        this_array = l2_normalise_barwise(this_array)
    return this_array@this_array.T
def get_rbf_autosimilarity(an_array, gamma = None, normalise = True):
    r"""
    Computes the autosimilarity matrix, where the similarity function is the Radial Basis Function (RBF).

    The RBF corresponds to the exponent of the opposite of the euclidean distance between features:

    .. math::
        s_{x_i,x_j} = \exp^{-\gamma ||x_i - x_j||_2}

    The RBF is computed via scikit-learn (rbf_kernel).
    The default gamma value is computed in function get_gamma_std(), refer to that function for further details.

    Parameters
    ----------
    an_array : numpy array or list
        The matrix whose autosimilarity will be computed.
        A list is converted to a numpy array first.
    gamma : positive float, optional
        The gamma parameter in the rbf function.
        The default is None, meaning that it is computed as function of the standard deviation,
        see get_gamma_std() for more details.
    normalise : boolean, optional
        Whether features should be normalised or not.
        Normalisation here means that the euclidean norm is computed between normalised vectors.
        The default is True.

    Returns
    -------
    numpy array
        The RBF autosimilarity of this array.
    """
    this_array = np.array(an_array) if isinstance(an_array, list) else an_array
    if gamma is None:
        # get_gamma_std receives the raw features and normalises them itself when `normalise` is set.
        gamma = get_gamma_std(this_array, scaling_factor = 1, no_diag = True, normalise = normalise)
    if normalise:
        this_array = l2_normalise_barwise(this_array)
    return pairwise_distances.rbf_kernel(this_array, gamma = gamma)
def get_centered_rbf_autosimilarity(an_array, gamma = None, normalise = True):
    """
    Computes the RBF autosimilarity matrix of the centered features.

    The mean over the first axis (the average bar) is subtracted from every row,
    then the RBF similarity is computed via scikit-learn (rbf_kernel),
    exactly as in get_rbf_autosimilarity().

    Parameters
    ----------
    an_array : numpy array or list
        The matrix whose autosimilarity will be computed.
        A list is converted to a numpy array first.
    gamma : positive float, optional
        The gamma parameter in the rbf function.
        The default is None, meaning that it is computed by get_gamma_std()
        on the centered features.
    normalise : boolean, optional
        Whether the centered features should be l2-normalised barwise
        before computing the kernel (and the default gamma).
        The default is True.

    Returns
    -------
    numpy array
        The RBF autosimilarity of the centered array.
    """
    this_array = np.array(an_array) if isinstance(an_array, list) else an_array
    # Centering: the mean over the first axis (the average bar) is subtracted from every row.
    this_array = this_array - this_array.mean(axis=0)
    if gamma is None:
        gamma = get_gamma_std(this_array, scaling_factor = 1, no_diag = True, normalise = normalise)
    if normalise:
        this_array = l2_normalise_barwise(this_array)
    return pairwise_distances.rbf_kernel(this_array, gamma = gamma)
def get_gamma_std(an_array, scaling_factor = 1, no_diag = True, normalise = True):
    """
    Default value for the gamma in the RBF similarity function.

    The value is scaling_factor / (2 * std), where std is the standard deviation of the
    pairwise euclidean distances between the rows of the array.
    More experiments should be made to fit it; for now, it has been set quite empirically.

    Parameters
    ----------
    an_array : numpy array
        The matrix whose autosimilarity will be computed.
    scaling_factor : positive float, optional
        Weighting parameter, relating to the inverse of the standard deviation.
        The default is 1.
    no_diag : boolean, optional
        Whether the diagonal values (self similarity values) should be discarded (True)
        or taken into account (False).
        The default is True.
    normalise : boolean, optional
        Whether features should be normalised or not.
        Normalisation here means that the euclidean norm is computed between normalised vectors.
        The default is True.

    Returns
    -------
    gamma : float
        The gamma parameter in the RBF similarity function.
    """
    if normalise:
        an_array = l2_normalise_barwise(an_array)
    euc_dist = pairwise_distances.euclidean_distances(an_array)
    if not no_diag:
        return scaling_factor/(2*np.std(euc_dist))
    # Self-distances are masked with NaN so that nanstd ignores them.
    for i in range(len(euc_dist)):
        euc_dist[i,i] = float('NaN')
    # This return was missing: the default branch used to yield None instead of a gamma value.
    return scaling_factor/(2*np.nanstd(euc_dist))
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 14 16:30:31 2022
@author: amarmore
Module used to compute autosimilarity matrices.
"""
import as_seg.model.errors as err
import numpy as np
import sklearn.metrics.pairwise as pairwise_distances
import warnings
eps = 1e-10
def switch_autosimilarity(an_array, similarity_type, gamma = None, normalise = True):
    r"""
    High-level dispatcher computing the autosimilarity matrix of a barwise matrix.

    The input is expected to be of shape (Bars, Feature representation).
    The similarity function s_{x_i,x_j} between two bars x_i and x_j is
    selected with ``similarity_type`` (case-insensitive):

    - "cosine" for the cosine similarity, i.e. the normalised dot product:

        .. math::
            s_{x_i,x_j} = \frac{\langle x_i, x_j \rangle}{||x_i|| ||x_j||}

    - "autocorrelation" (or its former name "covariance"),
      i.e. the dot product of centered features:

        .. math::
            s_{x_i,x_j} = \langle x_i - \hat{x}, x_j - \hat{x} \rangle

    - "rbf" for the Radial Basis Function similarity,
      i.e. the exponent of the opposite of the euclidean distance between features:

        .. math::
            s_{x_i,x_j} = \exp^{-\gamma ||x_i - x_j||_2}

      The euclidean distance can be the distance between the normalised features.
      See rbf_kernel from scikit-learn for more details.

    - "centered_rbf" for the RBF similarity computed on centered features.

    Parameters
    ----------
    an_array : numpy array
        The matrix whose autosimilarity is computed,
        expected to be of shape (Bars, Feature representation).
    similarity_type : string
        Either "cosine", "autocorrelation", "covariance", "rbf" or "centered_rbf".
    gamma : positive float, optional
        The gamma parameter of the RBF, only used by "rbf" and "centered_rbf".
        The default is None, meaning that it is computed from the standard deviation
        of the distances, see get_gamma_std() for more details.
    normalise : boolean, optional
        Whether features should be normalised or not.
        Normalisation depends on the similarity function.
        It is not forwarded to the cosine similarity, which always normalises.
        The default is True.

    Raises
    ------
    InvalidArgumentValueException
        If ``similarity_type`` is not one of the supported values.

    Returns
    -------
    numpy array
        Autosimilarity matrix of the input an_array.
    """
    similarity = similarity_type.lower()  # Lower-cased once instead of once per comparison.
    if similarity == "cosine":
        # The cosine similarity is normalised by definition, hence `normalise` is not passed.
        return get_cosine_autosimilarity(an_array)
    if similarity in ("autocorrelation", "covariance"):
        return get_autocorrelation_autosimilarity(an_array, normalise = normalise)
    if similarity == "rbf":
        return get_rbf_autosimilarity(an_array, gamma, normalise = normalise)
    if similarity == "centered_rbf":
        return get_centered_rbf_autosimilarity(an_array, gamma, normalise = normalise)
    raise err.InvalidArgumentValueException(f"Incorrect similarity type: {similarity_type}. Should be cosine, autocorrelation (or covariance), rbf or centered_rbf.")
def l2_normalise_barwise(an_array):
    """
    Divide every bar (row) of the array by its l_2 norm.

    The division is only performed for bars whose norm is non-zero.
    Entries left infinite afterwards (i.e. those of null bars, which keep their
    initial +inf filling) are replaced by the module-level small constant `eps`.

    Parameters
    ----------
    an_array : numpy array
        The array to normalise, bars being indexed by the first dimension.

    Returns
    -------
    numpy array
        The barwise-normalised array.
    """
    bar_norms = np.linalg.norm(an_array, axis = 1)
    transposed = np.transpose(an_array)
    # Pre-filled with +inf: the columns skipped by the masked division stay recognisable.
    quotient = np.inf * np.ones_like(transposed)
    np.divide(transposed, bar_norms, out = quotient, where = bar_norms != 0)
    cleaned = np.where(np.isinf(quotient), eps, quotient)
    return np.transpose(cleaned)
def get_cosine_autosimilarity(an_array):
    r"""
    Computes the autosimilarity matrix with the cosine similarity function.

    The cosine similarity function is the normalised dot product between two bars, i.e.:

    .. math::
        s_{x_i,x_j} = \frac{\langle x_i, x_j \rangle}{||x_i|| ||x_j||}

    Parameters
    ----------
    an_array : numpy array or list
        The matrix whose autosimilarity is to compute.
        Expected to be of shape (Bars, Feature representation).
        A list is converted to a numpy array first.

    Returns
    -------
    numpy array
        The autosimilarity of this array, with the cosine similarity function.
    """
    this_array = np.array(an_array) if isinstance(an_array, list) else an_array
    # Rows are always l2-normalised here: the dot product of unit rows is the cosine.
    this_array = l2_normalise_barwise(this_array)
    return this_array@this_array.T
def get_covariance_autosimilarity(an_array, normalise = True):
    """
    Deprecated alias of get_autocorrelation_autosimilarity().

    The matrix was renamed "Autocorrelation" in the TISMIR version of the paper;
    this name only forwards its arguments to the new function.
    """
    autocorrelation = get_autocorrelation_autosimilarity(an_array, normalise = normalise)
    return autocorrelation
def get_autocorrelation_autosimilarity(an_array, normalise = True):
    r"""
    Computes the autosimilarity matrix, where the similarity function is the autocorrelation.

    The autocorrelation similarity function corresponds to the dot product of centered features:

    .. math::
        s_{x_i,x_j} = \langle x_i - \hat{x}, x_j - \hat{x} \rangle

    where \hat{x} is the mean of the rows (bars) of the array.

    Parameters
    ----------
    an_array : numpy array or list
        The matrix whose autosimilarity will be computed.
        A list is converted to a numpy array first.
    normalise : boolean, optional
        Whether features should be normalised or not.
        Normalisation here means that each centered feature is normalised by its norm.
        The default is True.

    Returns
    -------
    numpy array
        The autocorrelation autosimilarity of this array.
    """
    this_array = np.array(an_array) if isinstance(an_array, list) else an_array
    # Centering: the mean over the first axis (the average bar) is subtracted from every row.
    this_array = this_array - this_array.mean(axis=0)
    if normalise:
        this_array = l2_normalise_barwise(this_array)
    return this_array@this_array.T
def get_rbf_autosimilarity(an_array, gamma = None, normalise = True):
    r"""
    Computes the autosimilarity matrix, where the similarity function is the Radial Basis Function (RBF).

    The RBF corresponds to the exponent of the opposite of the euclidean distance between features:

    .. math::
        s_{x_i,x_j} = \exp^{-\gamma ||x_i - x_j||_2}

    The RBF is computed via scikit-learn (rbf_kernel).
    The default gamma value is computed in function get_gamma_std(), refer to that function for further details.

    Parameters
    ----------
    an_array : numpy array or list
        The matrix whose autosimilarity will be computed.
        A list is converted to a numpy array first.
    gamma : positive float, optional
        The gamma parameter in the rbf function.
        The default is None, meaning that it is computed as function of the standard deviation,
        see get_gamma_std() for more details.
    normalise : boolean, optional
        Whether features should be normalised or not.
        Normalisation here means that the euclidean norm is computed between normalised vectors.
        The default is True.

    Returns
    -------
    numpy array
        The RBF autosimilarity of this array.
    """
    this_array = np.array(an_array) if isinstance(an_array, list) else an_array
    if gamma is None:
        # get_gamma_std receives the raw features and normalises them itself when `normalise` is set.
        gamma = get_gamma_std(this_array, scaling_factor = 1, no_diag = True, normalise = normalise)
    if normalise:
        this_array = l2_normalise_barwise(this_array)
    return pairwise_distances.rbf_kernel(this_array, gamma = gamma)
def get_centered_rbf_autosimilarity(an_array, gamma = None, normalise = True):
    """
    Computes the RBF autosimilarity matrix of the centered features.

    The mean over the first axis (the average bar) is subtracted from every row,
    then the RBF similarity is computed via scikit-learn (rbf_kernel),
    exactly as in get_rbf_autosimilarity().

    Parameters
    ----------
    an_array : numpy array or list
        The matrix whose autosimilarity will be computed.
        A list is converted to a numpy array first.
    gamma : positive float, optional
        The gamma parameter in the rbf function.
        The default is None, meaning that it is computed by get_gamma_std()
        on the centered features.
    normalise : boolean, optional
        Whether the centered features should be l2-normalised barwise
        before computing the kernel (and the default gamma).
        The default is True.

    Returns
    -------
    numpy array
        The RBF autosimilarity of the centered array.
    """
    this_array = np.array(an_array) if isinstance(an_array, list) else an_array
    # Centering: the mean over the first axis (the average bar) is subtracted from every row.
    this_array = this_array - this_array.mean(axis=0)
    if gamma is None:
        gamma = get_gamma_std(this_array, scaling_factor = 1, no_diag = True, normalise = normalise)
    if normalise:
        this_array = l2_normalise_barwise(this_array)
    return pairwise_distances.rbf_kernel(this_array, gamma = gamma)
def get_gamma_std(an_array, scaling_factor = 1, no_diag = True, normalise = True):
    """
    Default gamma for the RBF similarity function.

    Gamma is scaling_factor / (2 * std), std being the standard deviation of the
    pairwise euclidean distances between the rows of the array.
    This value has been set quite empirically, more experiments should be made to fit it.

    Parameters
    ----------
    an_array : numpy array
        The matrix whose autosimilarity will be computed.
    scaling_factor : positive float, optional
        Weighting parameter, relating to the inverse of the standard deviation.
        The default is 1.
    no_diag : boolean, optional
        True to discard the diagonal (self-distance) values,
        False to take them into account.
        The default is True.
    normalise : boolean, optional
        Whether the distances are computed between l2-normalised rows.
        The default is True.

    Returns
    -------
    gamma : float
        The gamma parameter in the RBF similarity function.
    """
    features = l2_normalise_barwise(an_array) if normalise else an_array
    distances = pairwise_distances.euclidean_distances(features)
    if no_diag:
        # Self-distances are masked with NaN so that nanstd ignores them.
        for bar_idx in range(len(distances)):
            distances[bar_idx,bar_idx] = float('NaN')
        spread = np.nanstd(distances)
    else:
        spread = np.std(distances)
    return scaling_factor/(2*spread)
\ No newline at end of file
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 14 18:34:29 2021
@author: amarmore
Module used to compute the Barwise TF matrix, presented in [1]
(Barwise TF matrix: a 2D representation of barwise features,
each feature representing Time-Frequency content, where time is expressed at barscale)
See [1 - Chapter 2.4] or [2] for more information.
References
----------
[1] Unsupervised Machine Learning Paradigms for the Representation of Music Similarity and Structure,
PhD Thesis Marmoret Axel
(not uploaded yet but will be soon!)
(You should check the website hal.archives-ouvertes.fr/ in case this docstring is not updated with the reference.)
[2] Marmoret, A., Cohen, J.E, and Bimbot, F., "Barwise Compression Schemes
for Audio-Based Music Structure Analysis"", in: 19th Sound and Music Computing Conference,
SMC 2022, Sound and music Computing network, 2022.
"""
import as_seg.data_manipulation as dm
import as_seg.model.errors as err
import numpy as np
import tensorly as tl
import librosa
# %% Tensors barwise spectrograms construction
# !!! Be extremely careful with the organization of modes, which can be either Frequency-Time at barscale-Bars (FTB) or Bars-Frequency-Time at barscale (BFT) depending on the method.
def tensorize_barwise_BFT(spectrogram, bars, hop_length_seconds, subdivision):
    """
    Build a 3rd order tensor-spectrogram, with modes ordered as Bars-Frequency-Time at barscale (BFT).

    Must be used for SSAE and the computation of the Barwise TF matrix.
    Every bar is represented by exactly `subdivision` frames, picked at regular
    positions between the start and end frame of the bar.
    The first entry of `bars` is left out before the conversion to frame indexes.
    Bars whose last selected frame lies beyond the spectrogram are not kept.
    See [1] for details.

    Parameters
    ----------
    spectrogram : numpy array
        The spectrogram to cut into bars (indexed as [frequency, frame]).
    bars : list of tuples
        List of the bars (start, end), in seconds.
    hop_length_seconds : float
        The hop_length, in seconds.
    subdivision : integer
        The number of frames kept in each bar.

    Raises
    ------
    ToDebugException
        If the subdivision is so large that a frame is selected twice in a bar.

    Returns
    -------
    np.array tensor
        The tensor-spectrogram as a np.array.
    """
    frame_bars = dm.segments_from_time_to_frame_idx(bars[1:], hop_length_seconds)
    kept_bars = []
    for frames in frame_bars:
        start, end = frames[0], frames[1]
        samples = [int(round(start + k * (end - start)/subdivision)) for k in range(subdivision)]
        if len(set(samples)) != len(samples): # Some frame index appears more than once
            raise err.ToDebugException("The subdivision is too large, it leads to repeated samples chosen in the bar!")
        if samples[-1] < spectrogram.shape[1]:
            kept_bars.append(spectrogram[:,samples])
    return np.array(kept_bars)
def tensorize_barwise_FTB(spectrogram, bars, hop_length_seconds, subdivision):
    """
    Build a 3rd order tensor-spectrogram, with modes ordered as Frequency-Time at barscale-Bars (FTB).

    Careful: the mode organization differs from tensorize_barwise_BFT.
    Must be used for NTD.
    Every bar is represented by exactly `subdivision` frames, picked at regular
    positions between the start and end frame of the bar.
    The first entry of `bars` is left out before the conversion to frame indexes.
    The construction stops at the first bar whose last selected frame lies beyond the spectrogram.
    See [1, Chap 2.4.2] for details.

    Parameters
    ----------
    spectrogram : numpy array
        The spectrogram to cut into bars (indexed as [frequency, frame]).
    bars : list of tuples
        List of the bars (start, end), in seconds.
    hop_length_seconds : float
        The hop_length, in seconds.
    subdivision : integer
        The number of frames kept in each bar.

    Returns
    -------
    tensorly tensor
        The tensor-spectrogram as a tensorly tensor.
    """
    nb_freq = spectrogram.shape[0]
    frame_bars = dm.segments_from_time_to_frame_idx(bars[1:], hop_length_seconds)
    # The first bar initialises the tensor; it is not checked against the spectrogram length.
    first_samples = [int(round(frame_bars[0][0] + k * (frame_bars[0][1] - frame_bars[0][0])/subdivision)) for k in range(subdivision)]
    tens = np.array(spectrogram[:,first_samples]).reshape(nb_freq, subdivision, 1)
    for frames in frame_bars[1:]:
        start, end = frames[0], frames[1]
        samples = [int(round(start + k * (end - start)/subdivision)) for k in range(subdivision)]
        if not samples[-1] < spectrogram.shape[1]:
            break
        bar_slice = spectrogram[:,samples].reshape(nb_freq, subdivision, 1)
        tens = np.append(tens, bar_slice, axis = 2)
    return tl.tensor(tens)
# %% Matrix barwise spectrograms handling
def barwise_TF_matrix(spectrogram, bars, hop_length_seconds, subdivision):
    """
    Compute the Barwise TF matrix: each bar of the spectrogram flattened into one Time-Frequency vector.

    The BFT tensor-spectrogram is built first, then unfolded along its first (bar) mode.
    See [1] for details.

    Parameters
    ----------
    spectrogram : numpy array
        The spectrogram to represent barwise.
    bars : list of tuples
        List of the bars (start, end), in seconds.
    hop_length_seconds : float
        The hop_length, in seconds.
    subdivision : integer
        The number of frames kept in each bar.

    Returns
    -------
    np.array
        The Barwise TF matrix, of sizes (b, tf).
    """
    bft_tensor = tensorize_barwise_BFT(spectrogram, bars, hop_length_seconds, subdivision)
    barwise_matrix = tl.unfold(bft_tensor, 0)
    return barwise_matrix
# %% Vector barwise spectrogram handling
def TF_vector_to_TF_matrix(vector, frequency_dimension, subdivision):
    """
    Fold a Time-Frequency vector back into a Time-Frequency matrix (spectrogram).

    Parameters
    ----------
    vector : np.array
        A Time-Frequency vector (typically a row in the Barwise TF matrix).
        Its length must be frequency_dimension * subdivision.
    frequency_dimension : positive integer
        The size of the frequency dimension.
    subdivision : positive integer
        The size of the time dimension at the bar scale
        (the parameter used when creating the Barwise TF matrix).

    Returns
    -------
    np.array
        A Time-Frequency matrix (spectrogram) of size (frequency_dimension, subdivision).
    """
    expected_length = frequency_dimension*subdivision
    assert expected_length == vector.shape[0]
    target_shape = (frequency_dimension,subdivision)
    return tl.fold(vector, 0, target_shape)
def beat_synchronize_msaf(spectrogram, frame_times, beat_frames, beat_times):
    """
    Aggregate the spectrogram between beats (via librosa's sync) and return it with the matching times.

    Parameters
    ----------
    spectrogram : numpy array
        The features to synchronise; transposed before and after the call to sync.
    frame_times : array-like
        Times of the frames; only the last one is used, when a time must be appended.
    beat_frames : array-like
        Beat positions, passed to librosa's sync.
    beat_times : array-like
        Times of the beats.

    Returns
    -------
    tuple
        (beat-synchronous features, beat times). The last frame time is appended to
        the times when their number differs from the number of synchronised features.
    """
    # Make beat synchronous
    synced_feats = librosa.util.utils.sync(spectrogram.T, beat_frames, pad=True).T
    synced_times = np.copy(beat_times)
    if synced_times.shape[0] == synced_feats.shape[0]:
        return synced_feats, synced_times
    # Sizes differ (padding in sync): add the last frame time.
    return synced_feats, np.concatenate((synced_times, [frame_times[-1]]))
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 14 18:34:29 2021
@author: amarmore
Module used to compute the Barwise TF matrix, presented in [1]
(Barwise TF matrix: a 2D representation of barwise features,
each feature representing Time-Frequency content, where time is expressed at barscale)
See [1 - Chapter 2.4] or [2] for more information.
References
----------
[1] Unsupervised Machine Learning Paradigms for the Representation of Music Similarity and Structure,
PhD Thesis Marmoret Axel
(not uploaded yet but will be soon!)
(You should check the website hal.archives-ouvertes.fr/ in case this docstring is not updated with the reference.)
[2] Marmoret, A., Cohen, J.E, and Bimbot, F., "Barwise Compression Schemes
for Audio-Based Music Structure Analysis"", in: 19th Sound and Music Computing Conference,
SMC 2022, Sound and music Computing network, 2022.
"""
import as_seg.data_manipulation as dm
import as_seg.model.errors as err
import numpy as np
import tensorly as tl
import librosa
# %% Spectrograms to tensors
# !!! Be extremely careful with the organization of modes, which can be either Frequency-Time at barscale-Bars (FTB) or Bars-Frequency-Time at barscale (BFT) depending on the method.
def tensorize_barwise_BFT(spectrogram, bars, hop_length_seconds, subdivision, subset_nb_bars = None):
    """
    Build a 3rd order tensor-spectrogram, with modes ordered as Bars-Frequency-Time at barscale (BFT).

    Must be used for SSAE and the computation of the Barwise TF matrix.
    Every bar is represented by exactly `subdivision` frames, picked at regular
    positions between the start and end frame of the bar.
    The first entry of `bars` is left out before the conversion to frame indexes.
    Bars whose last selected frame lies beyond the spectrogram are not kept.
    See [1] for details.

    Parameters
    ----------
    spectrogram : numpy array
        The spectrogram to cut into bars (indexed as [frequency, frame]).
    bars : list of tuples
        List of the bars (start, end), in seconds.
    hop_length_seconds : float
        The hop_length, in seconds.
    subdivision : integer
        The number of frames kept in each bar.
    subset_nb_bars : integer, optional
        If set, only the first `subset_nb_bars` bars (in frame indexes) are processed.
        The default is None, meaning that all bars are processed.

    Raises
    ------
    ToDebugException
        If a frame is selected twice in a bar which is not the last one.

    Returns
    -------
    np.array tensor
        The tensor-spectrogram as a np.array.
    """
    frame_bars = dm.segments_from_time_to_frame_idx(bars[1:], hop_length_seconds)
    if subset_nb_bars is not None:
        frame_bars = frame_bars[:subset_nb_bars]
    last_bar_idx = len(frame_bars) - 1
    kept_bars = []
    for bar_idx, frames in enumerate(frame_bars):
        start, end = frames[0], frames[1]
        samples = [int(round(start + k * (end - start)/subdivision)) for k in range(subdivision)]
        has_repetitions = len(set(samples)) != len(samples)
        # Repetitions are tolerated in the last bar, where they come from inconsistencies
        # between the last estimated downbeat and the end of the song.
        if has_repetitions and bar_idx != last_bar_idx:
            raise err.ToDebugException("The subdivision is too large, it leads to repeated samples chosen in the bar!")
        if samples[-1] < spectrogram.shape[1]:
            kept_bars.append(spectrogram[:,samples])
    return np.array(kept_bars)
def tensorize_barwise_FTB(spectrogram, bars, hop_length_seconds, subdivision, subset_nb_bars = None):
    """
    Build a 3rd order tensor-spectrogram, with modes ordered as Frequency-Time at barscale-Bars (FTB).

    Careful: the mode organization differs from tensorize_barwise_BFT.
    Must be used for NTD.
    Every bar is represented by exactly `subdivision` frames, picked at regular
    positions between the start and end frame of the bar.
    The first entry of `bars` is left out before the conversion to frame indexes.
    The construction stops at the first bar whose last selected frame lies beyond the spectrogram.
    See [1, Chap 2.4.2] for details.

    Parameters
    ----------
    spectrogram : numpy array
        The spectrogram to cut into bars (indexed as [frequency, frame]).
    bars : list of tuples
        List of the bars (start, end), in seconds.
    hop_length_seconds : float
        The hop_length, in seconds.
    subdivision : integer
        The number of frames kept in each bar.
    subset_nb_bars : integer, optional
        If set, only the first `subset_nb_bars` bars (in frame indexes) are processed.
        The default is None, meaning that all bars are processed.

    Returns
    -------
    tensorly tensor
        The tensor-spectrogram as a tensorly tensor.
    """
    nb_freq = spectrogram.shape[0]
    frame_bars = dm.segments_from_time_to_frame_idx(bars[1:], hop_length_seconds)
    if subset_nb_bars is not None:
        frame_bars = frame_bars[:subset_nb_bars]
    # The first bar initialises the tensor; it is not checked against the spectrogram length.
    first_samples = [int(round(frame_bars[0][0] + k * (frame_bars[0][1] - frame_bars[0][0])/subdivision)) for k in range(subdivision)]
    tens = np.array(spectrogram[:,first_samples]).reshape(nb_freq, subdivision, 1)
    for frames in frame_bars[1:]:
        start, end = frames[0], frames[1]
        samples = [int(round(start + k * (end - start)/subdivision)) for k in range(subdivision)]
        if not samples[-1] < spectrogram.shape[1]:
            break
        bar_slice = spectrogram[:,samples].reshape(nb_freq, subdivision, 1)
        tens = np.append(tens, bar_slice, axis = 2)
    return tl.tensor(tens)
# %% Tensors to spectrograms
def tensor_barwise_to_spectrogram(tensor, mode_order = "BFT", subset_nb_bars = None):
    """
    Flatten a barwise tensor-spectrogram into a 2D matrix keeping the frequency mode as rows.

    Parameters
    ----------
    tensor : array-like tensor
        The barwise tensor, organised as indicated by `mode_order`.
    mode_order : string, optional
        "BFT" (Bars-Frequency-Time) or "FTB" (Frequency-Time-Bars).
        The default is "BFT".
    subset_nb_bars : integer, optional
        If set, only the first `subset_nb_bars` bars of the tensor are kept.
        The default is None.

    Raises
    ------
    InvalidArgumentValueException
        If `mode_order` is neither "BFT" nor "FTB".

    Returns
    -------
    2D array
        For "BFT", the unfolding of the tensor along mode 1;
        for "FTB", the column-major reshape to (shape[0], shape[1] * shape[2]).
    """
    if subset_nb_bars is not None:
        tensor = barwise_subset_this_tensor(tensor, subset_nb_bars, mode_order = mode_order)
    if mode_order == "FTB":
        flat_shape = (tensor.shape[0], tensor.shape[1] * tensor.shape[2])
        # Note: this column-major reshape is NOT the same as unfold(0).
        return np.reshape(tensor, flat_shape, order = 'F')
    if mode_order == "BFT":
        return tl.unfold(tensor, 1)
    raise err.InvalidArgumentValueException(f"Unknown mode order: {mode_order}.")
def barwise_subset_this_tensor(tensor, subset_nb_bars, mode_order = "BFT"):
    """
    Keep only the first `subset_nb_bars` bars of a barwise tensor.

    Parameters
    ----------
    tensor : array-like tensor
        The barwise tensor, organised as indicated by `mode_order`.
    subset_nb_bars : integer
        The number of bars to keep.
    mode_order : string, optional
        "BFT" (bars on the first mode) or "FTB" (bars on the third mode).
        The default is "BFT".

    Raises
    ------
    InvalidArgumentValueException
        If `mode_order` is neither "BFT" nor "FTB".

    Returns
    -------
    tensor
        The tensor restricted to its first bars.
    """
    if mode_order == "FTB":
        return tensor[:,:,:subset_nb_bars]
    if mode_order == "BFT":
        return tensor[:subset_nb_bars]
    raise err.InvalidArgumentValueException(f"Unknown mode order: {mode_order}.")
def get_this_bar_tensor(tensor, bar_idx, mode_order = "BFT"):
    """
    Extract one bar from a barwise tensor.

    Parameters
    ----------
    tensor : array-like tensor
        The barwise tensor, organised as indicated by `mode_order`.
    bar_idx : integer
        The index of the bar to extract.
    mode_order : string, optional
        "BFT" (bars on the first mode) or "FTB" (bars on the third mode).
        The default is "BFT".

    Raises
    ------
    InvalidArgumentValueException
        If `mode_order` is neither "BFT" nor "FTB".

    Returns
    -------
    array
        The slice of the tensor corresponding to this bar.
    """
    if mode_order == "FTB":
        return tensor[:,:,bar_idx]
    if mode_order == "BFT":
        return tensor[bar_idx]
    raise err.InvalidArgumentValueException(f"Unknown mode order: {mode_order}.")
# %% Spectrogram to Barwise TF matrix
def barwise_TF_matrix(spectrogram, bars, hop_length_seconds, subdivision, subset_nb_bars = None):
    """
    Compute the Barwise TF matrix: each bar of the spectrogram flattened into one Time-Frequency vector.

    The BFT tensor-spectrogram is built first, then unfolded along its first (bar) mode.
    See [1] for details.

    Parameters
    ----------
    spectrogram : numpy array
        The spectrogram to represent barwise.
    bars : list of tuples
        List of the bars (start, end), in seconds.
    hop_length_seconds : float
        The hop_length, in seconds.
    subdivision : integer
        The number of frames kept in each bar.
    subset_nb_bars : integer, optional
        Forwarded to tensorize_barwise_BFT to restrict the number of bars.
        The default is None.

    Returns
    -------
    np.array
        The Barwise TF matrix, of sizes (b, tf).
    """
    bft_tensor = tensorize_barwise_BFT(spectrogram, bars, hop_length_seconds, subdivision, subset_nb_bars=subset_nb_bars)
    barwise_matrix = tl.unfold(bft_tensor, 0)
    return barwise_matrix
def barwise_subset_this_TF_matrix(matrix, subset_nb_bars):
    """
    Keep only the first `subset_nb_bars` rows (bars) of a Barwise TF matrix.

    Parameters
    ----------
    matrix : sliceable sequence (e.g. np.array)
        The Barwise TF matrix, one row per bar.
    subset_nb_bars : integer
        The number of bars to keep; must not be None (checked with an assert).

    Returns
    -------
    same type as matrix[:n]
        The first `subset_nb_bars` rows of the matrix.
    """
    assert subset_nb_bars is not None
    first_bars = slice(None, subset_nb_bars)
    return matrix[first_bars]
# %% Vector and Barwise TF to spectrogram
def TF_vector_to_spectrogram(vector, frequency_dimension, subdivision):
    """
    Fold a Time-Frequency vector back into a Time-Frequency matrix (spectrogram).

    Parameters
    ----------
    vector : np.array
        A Time-Frequency vector (typically a row in the Barwise TF matrix).
        Its length must be frequency_dimension * subdivision.
    frequency_dimension : positive integer
        The size of the frequency dimension.
    subdivision : positive integer
        The size of the time dimension at the bar scale
        (the parameter used when creating the Barwise TF matrix).

    Returns
    -------
    np.array
        A Time-Frequency matrix (spectrogram) of size (frequency_dimension, subdivision).
    """
    expected_length = frequency_dimension*subdivision
    assert expected_length == vector.shape[0]
    target_shape = (frequency_dimension,subdivision)
    return tl.fold(vector, 0, target_shape)
def TF_matrix_to_spectrogram(matrix, frequency_dimension, subdivision, subset_nb_bars = None):
    """
    Convert a Barwise TF matrix back into a single spectrogram.

    Each row of the matrix is folded into a (frequency_dimension, subdivision)
    spectrogram with ``TF_vector_to_spectrogram``, and the resulting bars are
    placed side by side along the time axis (axis 1).

    Parameters
    ----------
    matrix : np.array
        The Barwise TF matrix (one row per bar).
    frequency_dimension : positive integer
        The size of the frequency dimension.
    subdivision : positive integer
        The number of time components in each bar.
    subset_nb_bars : integer, optional
        If set, only the first ``subset_nb_bars`` rows of the matrix are used.
        The default is None (all rows).

    Returns
    -------
    np.array or None
        The concatenated spectrogram, or None if the matrix has no rows.

    """
    if subset_nb_bars is not None:
        matrix = barwise_subset_this_TF_matrix(matrix, subset_nb_bars)

    # Fold every bar first and concatenate once: concatenating inside the loop
    # re-copies the whole accumulated spectrogram at each bar.
    bars_content = [
        TF_vector_to_spectrogram(tf_vector, frequency_dimension, subdivision)
        for tf_vector in matrix
    ]
    if not bars_content:
        return None
    return np.concatenate(bars_content, axis=1)
# Tensor to Barwise TF
def tensor_barwise_to_barwise_TF(tensor, mode_order = "BFT"):
    """
    Unfold a barwise tensor into a Barwise TF matrix.

    Parameters
    ----------
    tensor : tensor
        The barwise tensor.
    mode_order : string, optional
        Order of the modes in the tensor:
        "BFT" unfolds along mode 0, "FTB" unfolds along mode 2.
        The default is "BFT".

    Raises
    ------
    err.InvalidArgumentValueException
        If mode_order is neither "BFT" nor "FTB".

    Returns
    -------
    The unfolded tensor, as returned by ``tl.unfold``.

    """
    if mode_order not in ("BFT", "FTB"):
        raise err.InvalidArgumentValueException(f"Unknown mode order: {mode_order}.")
    # Index of the mode to unfold along: 0 for BFT, 2 for FTB
    bar_mode = 0 if mode_order == "BFT" else 2
    return tl.unfold(tensor, bar_mode)
# %% Barwise TF to tensor
# TODO
# Beatwise MSAF
def beat_synchronize_msaf(spectrogram, frame_times, beat_frames, beat_times):
    """
    Aggregate the frames of a spectrogram between consecutive beats.

    The spectrogram is transposed, synchronized on ``beat_frames`` with
    ``librosa.util.utils.sync`` (with padding), and transposed back.

    Parameters
    ----------
    spectrogram : numpy array
        The spectrogram to synchronize.
        NOTE(review): presumably of shape (frames, features), since it is transposed before the call to sync - confirm.
    frame_times : array-like
        Times of the frames; only the last one is used, as the closing time when padding occurred.
    beat_frames : array-like
        Frame indexes of the beats.
    beat_times : array-like
        Times of the beats.

    Returns
    -------
    beatsync_feats : numpy array
        The beat-synchronized features.
    beatsync_times : numpy array
        A copy of the beat times, with the last frame time appended if the
        number of times does not match the number of synchronized features.

    """
    # Make beat synchronous
    beatsync_feats = librosa.util.utils.sync(spectrogram.T, beat_frames, pad=True).T

    # Assign times (and add last time if padded)
    beatsync_times = np.copy(beat_times)
    nb_synchronized = beatsync_feats.shape[0]
    if beatsync_times.shape[0] == nb_synchronized:
        return beatsync_feats, beatsync_times

    last_frame_time = [frame_times[-1]]
    padded_times = np.concatenate((beatsync_times, last_frame_time))
    return beatsync_feats, padded_times
......@@ -49,22 +49,14 @@ def get_bars_from_audio(song_path):
act = dbt.RNNDownBeatProcessor()(song_path)
proc = dbt.DBNDownBeatTrackingProcessor(beats_per_bar=[3,4], fps=100)
song_beats = proc(act)
downbeats_times = []
downbeats_times = [song_beats[0][0]]
if song_beats[0][1] != 1: # Adding a first downbeat at the start of the song
downbeats_times.append(0.1)
for beat in song_beats:
for beat in song_beats[1:]: # The first beat is already added
if beat[1] == 1: # If the beat is a downbeat
downbeats_times.append(beat[0])
# The following block of code artificially adds bars to the end of the song, in order to span the total song length.
# It seems like a good idea initially but may be detrimental, and should be debated anyway.
average_bar_length = np.mean([downbeats_times[i + 1] - downbeats_times[i] for i in range(len(downbeats_times) - 1)]) # average bar length in the song
song_length = act.shape[0]/100 # Total length of the song
while downbeats_times[-1] + 1.1*average_bar_length < song_length: # As long as the bar estimation does not cover the entire song
downbeats_times.append(round(downbeats_times[-1] + average_bar_length, 2)) # artifically adds bars of the length of the average bar length in the song
downbeats_times.append(song_length) # adding the last downbeat
return frontiers_to_segments(downbeats_times)
def get_beats_from_audio_msaf(signal, sr, hop_length):
......@@ -91,8 +83,8 @@ def get_beats_from_audio_madmom(song_path):
act = bt.TCNBeatProcessor()(song_path)
proc = bt.BeatTrackingProcessor(fps=100)
song_beats = proc(act)
beats_times = []
# beats_times = []
# if song_beats[0][1] != 1: # Adding a first downbeat at the start of the song
# beats_times.append(0.1)
# for beat in song_beats:
......@@ -435,7 +427,39 @@ def align_frontiers_on_bars(frontiers, bars):
else:
frontiers_on_bars.append(bars[i][0])
return frontiers_on_bars
def get_median_hop(bars, subdivision = 96, sampling_rate = 44100):
    """
    Returns the median hop length in the song, used for audio reconstruction.

    Every bar is sampled with 'subdivision' frames, whatever its duration.
    The gap between two consecutive frames (the hop length) hence depends on the bar,
    and the median of these gaps is used as a single value for reconstruction.

    Note that the first bar (index 0) is not taken into account in the median.

    Parameters
    ----------
    bars : list of tuples of float
        The bars, as (start time, end time) tuples.
    subdivision : integer, optional
        The number of subdivision of the bar to be contained in each slice of the tensor.
        The default is 96.
    sampling_rate : integer, optional
        The sampling rate of the signal, in Hz.
        The default is 44100.

    Returns
    -------
    integer
        The median hop length in these bars.

    """
    hops = [
        int((bar[1] - bar[0])/subdivision * sampling_rate)  # Hop of this bar, truncated to an integer
        for bar in bars[1:]  # The first bar is skipped
    ]
    median_hop = np.median(hops)
    return int(median_hop) # Generally used for audio reconstruction
# %% Sonification of the segmentation
def sonify_frontiers_path(audio_file_path, frontiers_in_seconds, output_path):
"""
......@@ -559,14 +583,14 @@ def compute_score_of_segmentation(reference, segments_in_time, window_length = 0
ref_intervals, useless = mir_eval.util.adjust_intervals(reference,t_min=0)
est_intervals, useless = mir_eval.util.adjust_intervals(np.array(segments_in_time), t_min=0, t_max=ref_intervals[-1, 1])
try:
return mir_eval.segment.detection(ref_intervals, est_intervals, window = window_length, trim = False)
return mir_eval.segment.detection(ref_intervals, est_intervals, window = window_length, trim = True)
except ValueError:
cleaned_intervals = []
#print("A segment is (probably) composed of the same start and end. Can happen with time -> bar -> time conversion, but should'nt happen for data originally segmented in bars.")
for idx in range(len(est_intervals)):
if est_intervals[idx][0] != est_intervals[idx][1]:
cleaned_intervals.append(est_intervals[idx])
return mir_eval.segment.detection(ref_intervals, np.array(cleaned_intervals), window = window_length, trim = False)
return mir_eval.segment.detection(ref_intervals, np.array(cleaned_intervals), window = window_length, trim = True)
def compute_median_deviation_of_segmentation(reference, segments_in_time):
"""
......
......@@ -18,7 +18,7 @@ import librosa
import mirdata
# Module encapsulating the computation of features from the signal
import as_seg.model.features as features
import as_seg.model.signal_to_spectrogram as signal_to_spectrogram
# General module for manipulating data: conversion between time, bars, frame indexes, loading of data, ...
import as_seg.data_manipulation as dm
......@@ -36,7 +36,7 @@ import as_seg.CBM_algorithm as cbm
from as_seg.model.current_plot import *
# %% Loading annotations and defining the audio path
path_to_beatles_dataset = 'C:/Users/amarmore/this_folder/Beatles dataset/' # To change
path_to_beatles_dataset = '/home/a23marmo/datasets/beatles' # To change
beatles = mirdata.initialize('beatles', path_to_beatles_dataset)
beatles.download()
......@@ -56,7 +56,7 @@ hop_length = 32 # Oversampling the spectrogram, to select frames which will be e
hop_length_seconds = hop_length/sampling_rate # As bars are in seconds, we convert this hop length in seconds.
subdivision_bars = 96 # The number of time samples to consider in each bar.
log_mel = features.get_spectrogram(the_signal, sampling_rate, "log_mel_grill", hop_length = hop_length) # Log_mel spectrogram
log_mel = signal_to_spectrogram.get_spectrogram(the_signal, sampling_rate, "log_mel", hop_length = hop_length) # Log_mel spectrogram
# %% Cosine autosimilarity
barwise_TF_cosine = bi.barwise_TF_matrix(log_mel, bars, hop_length_seconds, subdivision_bars)
......
import mirdata
import librosa
import as_seg.model.signal_to_spectrogram as signal_to_spectrogram
import pathlib
import shutil
import numpy as np
import warnings
import as_seg
eps = 1e-10
class BaseDataloader():
    """
    Base class for the dataloaders.

    It holds the signal-processing parameters and provides cache-aware helpers
    to compute bars and Barwise TF matrices, as well as scoring helpers.
    Child classes must implement ``__getitem__`` and ``__len__``, and must set
    ``self.data_path`` and ``self.dataset_name`` before ``save_segments`` is used.
    """

    def __init__(self, feature, cache_path = None, sr=44100, hop_length = 32, subdivision = 96, verbose = False):
        """
        Parameters
        ----------
        feature : string
            Name of the feature used to compute spectrograms.
        cache_path : string, optional
            Folder where bars and Barwise TF matrices are cached.
            The default is None (no cache).
        sr : integer, optional
            Sampling rate used to load signals. The default is 44100.
        hop_length : integer, optional
            Hop length of the spectrogram, in samples. The default is 32.
        subdivision : integer, optional
            Number of frames per bar. The default is 96.
        verbose : boolean, optional
            If True, prints a message when cached data is used. The default is False.
        """
        # Signal and spectrogram parameters
        self.feature = feature
        self.sr = sr
        self.hop_length = hop_length

        # For barwise or beatwise processing
        self.subdivision = subdivision

        # Cache and verbosity
        self.cache_path = cache_path
        self.verbose = verbose

        self.frequency_dimension = signal_to_spectrogram.get_default_frequency_dimension(feature) # Risky, because it is not linked to the computation. Should be computed from the spectrogram.

    def __getitem__(self, index):
        """Must be implemented in the child class."""
        raise NotImplementedError("This method should be implemented in the child class") from None

    def __len__(self):
        """Must be implemented in the child class."""
        raise NotImplementedError("This method should be implemented in the child class") from None

    def get_spectrogram(self, signal): # The spectrogram is not saved in the cache because it is too large in general
        """Compute the spectrogram of the signal with the parameters of this dataloader."""
        return signal_to_spectrogram.get_spectrogram(signal, self.sr, self.feature, self.hop_length)

    def get_bars(self, audio_path, index = None):
        """
        Return the bars of the song, from the cache when possible.

        Bars are cached in '<cache_path>/bars/<index>.npy'.
        Without a cache path, or without an index (a warning is then issued),
        bars are computed from the audio and nothing is stored.
        """
        def _compute_bars():
            return as_seg.data_manipulation.get_bars_from_audio(audio_path)

        # No cache: direct computation
        if self.cache_path is None:
            return _compute_bars()

        # A cache exists but cannot be addressed without an index
        if index is None:
            warnings.warn("No index provided for the cache, the cache will be ignored")
            return _compute_bars()

        dir_save_bars_path = f"{self.cache_path}/bars"
        bars_file = f"{dir_save_bars_path}/{index}.npy"
        try:
            bars = np.load(bars_file, allow_pickle=True)
        except FileNotFoundError:
            # Not cached yet: compute and store
            bars = _compute_bars()
            pathlib.Path(dir_save_bars_path).mkdir(parents=True, exist_ok=True)
            np.save(bars_file, bars)
        else:
            if self.verbose:
                print("Using cached bars.")
        return bars

    def get_barwise_tf_matrix(self, track_path, bars, index = None):
        """
        Return the Barwise TF matrix of the song, from the cache when possible.

        Matrices are cached in
        '<cache_path>/barwise_tf_matrix/<index>_<feature>_subdiv<subdivision>.npy'.
        Without a cache path, or without an index (a warning is then issued),
        the matrix is computed from the audio and nothing is stored.
        The module-level constant 'eps' is added to the computed matrix.
        """
        def _compute_barwise_tf_matrix():
            # Load the signal of the song
            sig, _ = librosa.load(track_path, sr=self.sr, mono=True) #torchaudio.load(track.audio_path)
            # Compute the spectrogram
            spectrogram = self.get_spectrogram(sig)
            return as_seg.barwise_input.barwise_TF_matrix(spectrogram, bars, self.hop_length/self.sr, self.subdivision) + eps

        # No cache: direct computation
        if self.cache_path is None:
            return _compute_barwise_tf_matrix()

        # A cache exists but cannot be addressed without an index
        if index is None:
            warnings.warn("No index provided for the cache, the cache will be ignored")
            return _compute_barwise_tf_matrix()

        cache_file_name = f"{index}_{self.feature}_subdiv{self.subdivision}"
        dir_save_barwise_tf_path = f"{self.cache_path}/barwise_tf_matrix"
        matrix_file = f"{dir_save_barwise_tf_path}/{cache_file_name}.npy"
        try:
            barwise_tf_matrix = np.load(matrix_file, allow_pickle=True)
        except FileNotFoundError:
            # Not cached yet: compute and store
            barwise_tf_matrix = _compute_barwise_tf_matrix()
            pathlib.Path(dir_save_barwise_tf_path).mkdir(parents=True, exist_ok=True)
            np.save(matrix_file, barwise_tf_matrix)
        else:
            if self.verbose:
                print("Using cached Barwise TF matrix.")
        return barwise_tf_matrix

    def save_segments(self, segments, name):
        """
        Save the segments in
        '<data_path>/estimations/segments/<dataset name, lowercase>/<name>.npy'.
        """
        # mirdata_segments = mirdata.annotations.SectionData(intervals=segments, interval_unit="s")
        # jams_segments = mirdata.jams_utils.sections_to_jams(mirdata_segments)
        dir_save_path = f"{self.data_path}/estimations/segments/{self.dataset_name.lower()}"
        pathlib.Path(dir_save_path).mkdir(parents=True, exist_ok=True)
        np.save(f"{dir_save_path}/{name}.npy", segments)

    def score_flat_segmentation(self, segments, annotations):
        """Score the segments against the annotations, with tolerances of 0.5s and 3s (in that order)."""
        scorer = as_seg.data_manipulation.compute_score_of_segmentation
        close_tolerance = scorer(annotations, segments, window_length=0.5)
        large_tolerance = scorer(annotations, segments, window_length=3)
        return close_tolerance, large_tolerance

    def segments_from_bar_to_seconds(self, segments, bars):
        """Convert segments expressed in bars into segments expressed in time."""
        # May be useful, if ever.
        return as_seg.data_manipulation.segments_from_bar_to_time(segments, bars)
class RWCPopDataloader(BaseDataloader):
    """
    Dataloader for the RWC Pop dataset, loaded through mirdata ('rwc_popular').

    Indexing the dataloader returns, for one track:
    (track_id, bars, barwise_tf_matrix, annotations_intervals).
    """

    def __init__(self, path, feature, cache_path = None, download=False, sr=44100, hop_length = 32, subdivision = 96):
        """
        Parameters
        ----------
        path : string
            Path of the dataset (used as 'data_home' by mirdata).
        feature : string
            Name of the feature used to compute spectrograms.
        cache_path : string, optional
            Folder where bars and Barwise TF matrices are cached. The default is None.
        download : boolean, optional
            If True, calls the mirdata download routine. The default is False.
        sr, hop_length, subdivision : integers, optional
            Signal-processing parameters, forwarded to BaseDataloader.
        """
        super().__init__(feature, cache_path, sr, hop_length, subdivision)
        self.data_path = path
        rwcpop = mirdata.initialize('rwc_popular', data_home = path)
        if download:
            rwcpop.download()
        self.all_tracks = rwcpop.load_tracks()
        self.indexes = rwcpop.track_ids
        self.dataset_name = "RWCPop"

    def __getitem__(self, index):
        """Return (track_id, bars, barwise_tf_matrix, annotations_intervals) for the track at this position."""
        track_id = self.indexes[index]
        track = self.all_tracks[track_id]

        # Compute the bars
        bars = self.get_bars(track.audio_path, index=track_id)

        # Compute the barwise TF matrix
        barwise_tf_matrix = self.get_barwise_tf_matrix(track.audio_path, bars, index=track_id)

        # Get the annotations
        annotations_intervals = track.sections.intervals

        # Return the bars, the barwise TF matrix and the annotations
        return track_id, bars, barwise_tf_matrix, annotations_intervals

    def __len__(self):
        """Number of tracks which can be indexed."""
        return len(self.indexes)

    def get_track_of_id(self, track_id):
        """
        Return the item associated with this track id.

        The id is searched as given, then as a string
        (same lookup as in the other dataloaders of this module).

        Raises
        ------
        ValueError
            If the track id is not in the dataset.
        """
        try:
            index = self.indexes.index(track_id)
        except ValueError:
            try:
                index = self.indexes.index(str(track_id))
            except ValueError:
                raise ValueError(f"Track {track_id} not found in the dataset") from None
        # __getitem__ is a method: it has to be called, not subscripted.
        return self.__getitem__(index)

    def format_dataset(self, path_audio_files):
        """
        Copy the audio files to the location expected by mirdata.

        Supposes that the audio files are all in the same folder (path_audio_files),
        and that they are named as the last component of the track's audio_path.
        """
        for track_num in range(len(self.all_tracks)):
            track_idx = self.indexes[track_num]
            song_file_name = self.all_tracks[track_idx].audio_path.split('/')[-1]
            src = f"{path_audio_files}/{song_file_name}" # May change depending on your file structure
            dest = self.all_tracks[track_idx].audio_path
            pathlib.Path(dest).parent.absolute().mkdir(parents=True, exist_ok=True)
            shutil.copy(src, dest)
class SALAMIDataloader(BaseDataloader):
    """
    Dataloader for the SALAMI dataset, loaded through mirdata ('salami').

    Indexing the dataloader returns, for one track:
    (track_id, bars, barwise_tf_matrix, dict_annotations),
    or (track_id, None, None, None) if a file of the track is not found.

    The dictionary of annotations contains:
    - "upper_level_annotations" / "lower_level_annotations": the reference annotations,
    - "upper_level_annotations_2" / "lower_level_annotations_2": the annotations of the second annotator, when two annotators are available,
    - "annot_number": the number of available annotators (1 or 2).
    """

    def __init__(self, path, feature, cache_path = None, download=False, subset = None, sr=44100, hop_length = 32, subdivision = 96):
        """
        Parameters
        ----------
        path : string
            Path of the dataset (used as 'data_home' by mirdata).
        feature : string
            Name of the feature used to compute spectrograms.
        cache_path : string, optional
            Folder where bars and Barwise TF matrices are cached. The default is None.
        download : boolean, optional
            If True, calls the mirdata download routine. The default is False.
        subset : string, optional
            Restricts the tracks to a subset (see split_training_test):
            "train", "test", or "debug" (the first 4 tracks of the test subset).
            The default is None (all tracks).
        sr, hop_length, subdivision : integers, optional
            Signal-processing parameters, forwarded to BaseDataloader.

        Raises
        ------
        ValueError
            If the subset is not None, "train", "test" or "debug".
        """
        super().__init__(feature, cache_path, sr, hop_length, subdivision)

        self.dataset_name = "SALAMI"

        self.data_path = path
        salami = mirdata.initialize('salami', data_home = path)
        if download:
            salami.download()
        self.all_tracks = salami.load_tracks()
        self.indexes = salami.track_ids

        self.subset = subset
        if subset is not None:
            train_indexes, test_indexes = self.split_training_test()
            if subset == "train":
                self.indexes = train_indexes
            elif subset == "test":
                self.indexes = test_indexes
            elif subset == "debug":
                self.indexes = test_indexes[:4]
            else:
                raise ValueError("Subset should be either 'train', 'test' or 'debug'")

    def __getitem__(self, index):
        """
        Return (track_id, bars, barwise_tf_matrix, dict_annotations) for the track at this position,
        or (track_id, None, None, None) if a FileNotFoundError occurs.
        """
        # Parsing through files ordered with self.indexes
        track_id = self.indexes[index]
        track = self.all_tracks[track_id]

        try:
            # Compute the bars
            bars = self.get_bars(track.audio_path, index=track_id)

            # Compute the barwise TF matrix
            barwise_tf_matrix = self.get_barwise_tf_matrix(track.audio_path, bars, index=track_id)

            # Get the annotations
            dict_annotations = self.get_annotations(track)

            # Return the bars, the barwise TF matrix and the annotations
            return track_id, bars, barwise_tf_matrix, dict_annotations

        except FileNotFoundError:
            print(f'{track_id} not found.')
            return track_id, None, None, None
            # raise FileNotFoundError(f"Song {track_id} not found, normal ?") from None

    def __len__(self):
        """Number of tracks which can be indexed."""
        # To handle the fact that indexes are updated with the subset
        return len(self.indexes)

    def get_track_of_id(self, track_id):
        """
        Return the item associated with this track id.

        The id is searched as given, then as a string.

        Raises
        ------
        ValueError
            If the track id is not in the (subset of the) dataset.
        """
        try:
            index = self.indexes.index(track_id)
        except ValueError:
            try:
                index = self.indexes.index(str(track_id))
            except ValueError:
                raise ValueError(f"Track {track_id} not found in the dataset") from None
        return self.__getitem__(index)

    def get_annotations(self, track):
        """
        Gather the annotations of a track in a dictionary.

        The first annotator is used as reference when available (the second one is then stored
        under the "_2" keys, if available too). Otherwise, the second annotator is used as reference.

        Raises
        ------
        AttributeError
            If no annotation can be read for this track.
        """
        dict_annotations = {}
        try:
            # Trying to get the first annotator
            dict_annotations["upper_level_annotations"] = track.sections_annotator_1_uppercase.intervals
            dict_annotations["lower_level_annotations"] = track.sections_annotator_1_lowercase.intervals
            try: # Trying to load the second annotator
                dict_annotations["upper_level_annotations_2"] = track.sections_annotator_2_uppercase.intervals
                dict_annotations["lower_level_annotations_2"] = track.sections_annotator_2_lowercase.intervals
                dict_annotations["annot_number"] = 2
            except AttributeError: # Only the first annotator was loaded
                dict_annotations["annot_number"] = 1
        except AttributeError:
            try:
                # Trying to get the second annotator (no first one)
                dict_annotations["upper_level_annotations"] = track.sections_annotator_2_uppercase.intervals
                dict_annotations["lower_level_annotations"] = track.sections_annotator_2_lowercase.intervals
                dict_annotations["annot_number"] = 1
            except AttributeError:
                raise AttributeError(f"No annotations found for {track.track_id}")

        return dict_annotations

    def get_this_set_annotations(self, dict_annotations, annotation_level = "upper", annotator = 1):
        """
        Select one set of annotations in the dictionary of annotations.

        Parameters
        ----------
        dict_annotations : dictionary
            The annotations, as returned by get_annotations.
        annotation_level : string, optional
            "upper" or "lower". The default is "upper".
        annotator : integer, optional
            1 for the reference annotations,
            2 for the annotations of the second annotator (stored under the "_2" keys).
            The default is 1.

        Raises
        ------
        ValueError
            If the annotator or the annotation level is invalid.
        AssertionError
            If the second annotator is requested but not available.
        """
        if annotator == 1:
            key_suffix = ""
        elif annotator == 2:
            assert dict_annotations["annot_number"] == 2, "No second annotator found."
            # The second annotator is stored under dedicated keys:
            # reading the un-suffixed keys would return the first annotator again.
            key_suffix = "_2"
        # elif annotator == "both":
        #     assert dict_annotations["annot_number"] == 2, "No second annotator found."
        #     annotations = dict_annotations["upper_level_annotations"] + dict_annotations["upper_level_annotations_2"]
        else:
            raise ValueError("Invalid annotator number")

        if annotation_level not in ("upper", "lower"):
            raise ValueError("Invalid annotation level")

        return dict_annotations[f"{annotation_level}_level_annotations{key_suffix}"]

    def split_training_test(self):
        """
        Split the track ids in two lists: (train, test).

        Tracks for which the upper-level annotations of both annotators can be read
        go in the test list, all the others in the train list.
        """
        indexes_train = []
        indexes_test = []
        for track_id in self.indexes:
            track = self.all_tracks[track_id]
            try:
                # Only accessed to check that both annotators are available
                track.sections_annotator_1_uppercase.intervals
                track.sections_annotator_2_uppercase.intervals
                indexes_test.append(track_id)
            except AttributeError:
                indexes_train.append(track_id)
        return indexes_train, indexes_test

    def score_flat_segmentation(self, segments, dict_annotations, annotation_level = "upper", annotator = 1):
        """
        Score the segments against one set of annotations (see get_this_set_annotations).

        With annotator = "both", returns the pair (score with annotator 1, score with annotator 2).
        """
        if annotator == "both":
            assert dict_annotations["annot_number"] == 2, "No second annotator found."
            score_annot_1 = self.score_flat_segmentation(segments, dict_annotations, annotation_level = annotation_level, annotator = 1)
            score_annot_2 = self.score_flat_segmentation(segments, dict_annotations, annotation_level = annotation_level, annotator = 2)
            return score_annot_1, score_annot_2

        annotations = self.get_this_set_annotations(dict_annotations, annotation_level = annotation_level, annotator = annotator)
        return super().score_flat_segmentation(segments, annotations)

    def score_flat_segmentation_twolevel(self, segments_upper_level, segments_lower_level, dict_annotations, annotator = 1):
        """Score one segmentation on the upper level and one on the lower level; returns (upper score, lower score)."""
        score_upper_level = self.score_flat_segmentation(segments_upper_level, dict_annotations, annotation_level = "upper", annotator = annotator)
        score_lower_level = self.score_flat_segmentation(segments_lower_level, dict_annotations, annotation_level = "lower", annotator = annotator)
        return score_upper_level, score_lower_level

    def score_flat_segmentation_twolevel_best_of_several(self, list_segments_upper_level, list_segments_lower_level, dict_annotations, annotator = 1):
        """
        Score several candidate segmentations per level, and keep the best one.

        For each level and each tolerance (0.5s and 3s), the scores of the candidate
        with the highest value at index 2 of its score (F measure) are kept.
        Returns (upper score, lower score), each being a (close tolerance, large tolerance) pair.
        """
        assert annotator != "both", "Not implemented yet"
        stack_upper_scores = -np.inf * np.ones((len(list_segments_upper_level),2,3))
        for idx, segments in enumerate(list_segments_upper_level):
            stack_upper_scores[idx] = self.score_flat_segmentation(segments, dict_annotations, annotation_level = "upper", annotator = annotator)
        idx_close = np.argmax(stack_upper_scores[:,0,2]) # Selecting best f measure at 0.5s
        idx_large = np.argmax(stack_upper_scores[:,1,2]) # Selecting best f measure at 3s
        score_upper_level = (stack_upper_scores[idx_close,0,:], stack_upper_scores[idx_large,1,:])

        stack_lower_scores = -np.inf * np.ones((len(list_segments_lower_level),2,3))
        for idx, segments in enumerate(list_segments_lower_level):
            stack_lower_scores[idx] = self.score_flat_segmentation(segments, dict_annotations, annotation_level = "lower", annotator = annotator)
        idx_close = np.argmax(stack_lower_scores[:,0,2]) # Selecting best f measure at 0.5s
        idx_large = np.argmax(stack_lower_scores[:,1,2]) # Selecting best f measure at 3s
        score_lower_level = (stack_lower_scores[idx_close,0,:], stack_lower_scores[idx_large,1,:])

        return score_upper_level, score_lower_level

    def get_sizes_of_annotated_segments(self, annotation_level = "upper", annotator = 1, plot = False):
        """
        Return the lengths (in number of bars) of all the annotated segments of the dataset.

        Tracks raising a FileNotFoundError are reported and skipped.
        If plot is True, the lengths are also sent to the plotting routine of as_seg.
        """
        lengths = []
        for track_id in self.indexes:
            track = self.all_tracks[track_id]

            try:
                # Compute the bars
                bars = self.get_bars(track.audio_path, index=track_id)

                # Get the annotations
                dict_annotations = self.get_annotations(track)

                annotations = self.get_this_set_annotations(dict_annotations, annotation_level = annotation_level, annotator = annotator)

                barwise_annot = as_seg.data_manipulation.frontiers_from_time_to_bar(np.array(annotations)[:,1], bars) # Convert the annotations from time to bar

                for i in range(len(barwise_annot) - 1):
                    lengths.append(barwise_annot[i+1] - barwise_annot[i]) # Store the length of the annotated segment

            except FileNotFoundError:
                print(f'{track_id} not found.')
                # raise FileNotFoundError(f"Song {track_id} not found, normal ?") from None

        if plot:
            as_seg.model.current_plot.plot_lenghts_hist(lengths)

        return lengths
# def format_dataset(self, path_audio_files): # TODO
# # Copy audio files to the right location.
# # Suppose that the audio files are all in the same folder
# for track_num in range(len(self.all_tracks)):
# track_idx = self.indexes[track_num]
# song_file_name = self.all_tracks[track_idx].audio_path.split('/')[-1]
# src = f"{path_audio_files}/{song_file_name}" # May change depending on your file structure
# dest = self.all_tracks[track_idx].audio_path
# pathlib.Path(dest).parent.absolute().mkdir(parents=True, exist_ok=True)
# shutil.copy(src, dest)
class BeatlesDataloader(BaseDataloader):
    """
    Dataloader for the Beatles dataset, loaded through mirdata ('beatles').

    Indexing the dataloader returns, for one track:
    (track_id, bars, barwise_tf_matrix, annotations_intervals).
    """

    def __init__(self, path, feature, cache_path = None, download=False, sr=44100, hop_length = 32, subdivision = 96):
        """
        Parameters
        ----------
        path : string
            Path of the dataset (used as 'data_home' by mirdata).
        feature : string
            Name of the feature used to compute spectrograms.
        cache_path : string, optional
            Folder where bars and Barwise TF matrices are cached. The default is None.
        download : boolean, optional
            If True, calls the mirdata download routine. The default is False.
        sr, hop_length, subdivision : integers, optional
            Signal-processing parameters, forwarded to BaseDataloader.
        """
        super().__init__(feature, cache_path, sr, hop_length, subdivision)
        self.data_path = path
        beatles = mirdata.initialize('beatles', data_home = path)
        if download:
            beatles.download()
        self.all_tracks = beatles.load_tracks()
        self.indexes = beatles.track_ids

        self.dataset_name = "Beatles"

    def __getitem__(self, index):
        """Return (track_id, bars, barwise_tf_matrix, annotations_intervals) for the track at this position."""
        track_id = self.indexes[index]
        track = self.all_tracks[track_id]

        # Compute the bars
        bars = self.get_bars(track.audio_path, index=track_id)

        # Compute the barwise TF matrix
        barwise_tf_matrix = self.get_barwise_tf_matrix(track.audio_path, bars, index=track_id)

        # Get the annotations
        annotations_intervals = track.sections.intervals

        # Return the bars, the barwise TF matrix and the annotations
        return track_id, bars, barwise_tf_matrix, annotations_intervals

    def __len__(self):
        """Number of tracks which can be indexed."""
        # __getitem__ reads self.indexes, so the length is taken from the same list
        # (as in the other dataloaders of this module).
        return len(self.indexes)

    def get_track_of_id(self, track_id):
        """
        Return the item associated with this track id.

        The id is searched as given, then as a string.

        Raises
        ------
        ValueError
            If the track id is not in the dataset.
        """
        try:
            index = self.indexes.index(track_id)
        except ValueError:
            try:
                index = self.indexes.index(str(track_id))
            except ValueError:
                raise ValueError(f"Track {track_id} not found in the dataset") from None
        return self.__getitem__(index)
if __name__ == "__main__":
    # Quick manual check: iterate over a dataset and print the ids of its tracks.
    # Items are (track_id, bars, barwise_tf_matrix, annotations) tuples.

    # rwcpop = RWCPopDataloader('/home/a23marmo/datasets/rwcpop', feature = "mel", cache_path = "/home/a23marmo/Bureau/data_persisted/rwcpop")
    # # rwcpop.format_dataset('/home/a23marmo/Bureau/Audio samples/rwcpop/Entire RWC')
    # for track_id, bars, barwise_tf_matrix, annotations in rwcpop:
    #     print(track_id)

    salami = SALAMIDataloader('/home/a23marmo/datasets/salami', feature = "mel", cache_path = "/home/a23marmo/Bureau/data_persisted/salami", subset = "train")

    # Missing files are already reported by SALAMIDataloader.__getitem__
    # (the other elements of the item are then None), so no try/except is needed here.
    for track_id, bars, barwise_tf_matrix, annotations in salami:
        print(track_id)
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 25 16:54:59 2020
@author: amarmore
Computing spectrogram in different feature description.
Note that Mel (and variants of Mel) spectrograms are denoted "mel_grill",
as they follow the particular definition of [1].
[1] Grill, T., & Schlüter, J. (2015, October).
Music Boundary Detection Using Neural Networks on Combined Features and Two-Level Annotations.
In ISMIR (pp. 531-537).
"""
import numpy as np
import librosa.core
import librosa.feature
import librosa.effects
from math import inf
import as_seg.model.errors as err
import IPython.display as ipd
def _sum_over_channels(signal, compute_feature):
    """
    Apply compute_feature to every channel of the signal, and sum the results.

    A 1D signal is processed as a single channel.
    Otherwise, channels are read as the columns signal[:, i].
    """
    if len(signal.shape) == 1:
        return compute_feature(np.asfortranarray(signal))
    accumulated = compute_feature(np.asfortranarray(signal[:,0]))
    for channel_idx in range(1, signal.shape[1]):
        accumulated += compute_feature(np.asfortranarray(signal[:,channel_idx]))
    return accumulated

def get_spectrogram(signal, sr, feature, hop_length, n_fft = 2048, fmin = 98, n_mfcc = 20):
    """
    Returns a spectrogram, from the signal of a song.
    Different types of spectrogram can be computed, which are specified by the argument "feature".
    All these spectrograms are computed with the toolbox librosa [1].

    For multichannel signals, the feature is computed on each channel (column of the signal),
    and the results are summed.

    Parameters
    ----------
    signal : numpy array
        Signal of the song, either 1D or with one channel per column.
    sr : float
        Sampling rate of the signal, (typically 44100Hz).
    feature : String
        The types of spectrograms to compute (case-insensitive).
            - stft : computes the Short-Time Fourier Transform of the signal.
                Returns the Power spectrogram.
            - pcp : computes a chromagram.
                NB: this chromagram has been specifically fitted as a team,
                and the arguments are non standard but rather technical choices.
            - pcp_stft : computes a chromagram from the stft of the harmonic part of the song.
            - cqt : computes a Constant-Q transform of the song (power).
            - log_cqt : computes the logarithm of the Constant-Q transform of the song.
            - tonnetz : computes the tonnetz representation of the song.
            - pcp_tonnetz : computes the tonnetz representation of the song, starting from the chromas ("pcp").
                It allows us to better control parameters over the computation of tonnetz,
                and can reduce computation when chromas are already computed (for scripts loading already computed spectrograms).
            - mfcc : computes the Mel-Frequency Cepstral Coefficients of the song.
            - mel_grill : computes the mel-spectrogram of the song, as dimensioned by [2].
            - log_mel_grill : computes the logarithm of the previously defined mel-spectrogram.
            - nn_log_mel_grill : computes the log(mel + 1) of the previously defined mel-spectrogram.
            - padded_log_mel_grill : computes the log_mel_grill spectrogram, shifted so that its minimum is 0.
    hop_length : integer
        The desired hop_length, which is the step between two frames (ie the time "discretization" step)
        It is expressed in terms of number of samples, which are defined by the sampling rate.
        NB: it is not used for the "tonnetz" feature.
    n_fft : integer, optional
        Number of frames by stft feature (used for "stft" and "pcp_stft" only,
        Mel spectrograms always use 2048).
        The default is 2048.
    fmin : integer, optional
        The minimal frequence to consider, used for denoising (used for "pcp" and "pcp_tonnetz" only,
        Mel spectrograms always use 80).
        The default is 98.
    n_mfcc : integer, optional
        Number of mfcc features.
        The default is 20 (as in librosa).

    Raises
    ------
    InvalidArgumentValueException
        If the "feature" argument is not presented above.

    Returns
    -------
    numpy array
        Spectrogram of the signal.

    References
    ----------
    [1] McFee, B., Raffel, C., Liang, D., Ellis, D. P., McVicar, M., Battenberg, E., & Nieto, O. (2015, July).
    librosa: Audio and music signal analysis in python.
    In Proceedings of the 14th python in science conference (Vol. 8).

    [2] Grill, T., & Schlüter, J. (2015, October).
    Music Boundary Detection Using Neural Networks on Combined Features and Two-Level Annotations.
    In ISMIR (pp. 531-537).

    """
    feature_name = feature.lower()

    def _power_stft(channel):
        return np.abs(librosa.core.stft(y=channel, n_fft=n_fft, hop_length = hop_length)) ** 2

    def _chroma_stft(channel):
        # Chromas are computed on the harmonic part of the signal only
        audio_harmonic, _ = librosa.effects.hpss(y=channel)
        return librosa.feature.chroma_stft(y=audio_harmonic, sr=sr, n_fft = n_fft, hop_length=hop_length)

    def _chroma_cens(channel):
        norm = inf # Columns normalization
        win_len_smooth = 82 # Size of the smoothing window
        n_octaves = 6
        bins_per_chroma = 3
        bins_per_octave = bins_per_chroma * 12
        return librosa.feature.chroma_cens(y=channel, sr=sr, hop_length=hop_length,
                                           fmin=fmin, n_chroma=12, n_octaves=n_octaves, bins_per_octave=bins_per_octave,
                                           norm=norm, win_len_smooth=win_len_smooth)

    def _power_cqt(channel):
        return np.abs(librosa.core.cqt(y=channel, sr = sr, hop_length = hop_length)) ** 2

    def _tonnetz(channel):
        return librosa.feature.tonnetz(y=channel, sr = sr)

    def _mfcc(channel):
        return librosa.feature.mfcc(y=channel, sr = sr, hop_length = hop_length, n_mfcc=n_mfcc)

    def _mel_grill(channel):
        # For Mel spectrograms, we use the same parameters as the ones of [2].
        return np.abs(librosa.feature.melspectrogram(y=channel, sr = sr, n_fft=2048, hop_length = hop_length, n_mels=80, fmin=80.0, fmax=16000))

    if feature_name == "stft":
        return _sum_over_channels(signal, _power_stft)

    elif feature_name == "pcp_stft":
        return _sum_over_channels(signal, _chroma_stft)

    elif feature_name == "pcp":
        return _sum_over_channels(signal, _chroma_cens)

    elif feature_name == "cqt":
        return _sum_over_channels(signal, _power_cqt)

    elif feature_name == "log_cqt":
        power_cqt = _sum_over_channels(signal, _power_cqt)
        # The input of amplitude_to_db is its first argument, named S (there is no 'y' keyword).
        return ((1.0/80.0) * librosa.core.amplitude_to_db(np.abs(power_cqt), ref=np.max)) + 1.0

    elif feature_name == "tonnetz":
        return _sum_over_channels(signal, _tonnetz)

    elif feature_name == "pcp_tonnetz":
        return librosa.feature.tonnetz(y=None, sr = None, chroma = get_spectrogram(signal, sr, "pcp", hop_length, fmin = fmin))

    # elif feature_name == "hcqt":
    #     return my_compute_hcqt(np.asfortranarray(signal[:,0]), sr)

    elif feature_name == "mfcc":
        return _sum_over_channels(signal, _mfcc)

    elif feature_name == "mel_grill":
        return _sum_over_channels(signal, _mel_grill)

    elif feature_name == "log_mel_grill":
        return librosa.power_to_db(_sum_over_channels(signal, _mel_grill))

    elif feature_name == "nn_log_mel_grill":
        mel = _sum_over_channels(signal, _mel_grill)
        return librosa.power_to_db(mel + np.ones(mel.shape))

    elif feature_name == "padded_log_mel_grill":
        log_mel = get_spectrogram(signal, sr, "log_mel_grill", hop_length)
        return log_mel - np.amin(log_mel) * np.ones(log_mel.shape)

    elif feature_name in ("mel", "log_mel", "nn_log_mel"):
        raise err.InvalidArgumentValueException("Invalid feature parameter, aren't you looking for mel_grill/log_mel_grill (the only available Mel Spectrograms)?")

    else:
        raise err.InvalidArgumentValueException(f"Unknown signal representation: {feature}.")
def get_log_mel_from_mel(mel_spectrogram, feature):
    """
    Computes a variant of a Mel spectrogram (typically Log Mel).

    Parameters
    ----------
    mel_spectrogram : numpy array
        Mel spectrogram of the signal.
    feature : string
        Desired feature name (must be a variant of a Mel spectrogram).
        Handled values:
            - "log_mel_grill": decibel conversion of the magnitude of the Mel spectrogram,
            - "nn_log_mel_grill": decibel conversion of (magnitude + 1),
            - "padded_log_mel_grill": "log_mel_grill" shifted so that its minimum is 0,
            - "minmax_log_mel_grill": "padded_log_mel_grill" divided by its maximum.

    Raises
    ------
    err.InvalidArgumentValueException
        Raised in case of unknown feature name.

    Returns
    -------
    numpy array
        Variant of the Mel spectrogram of the signal.

    """
    if feature == "log_mel_grill":
        return librosa.power_to_db(np.abs(mel_spectrogram))
    elif feature == "nn_log_mel_grill":
        # Take the magnitude first, as in the "log_mel_grill" branch, so that both
        # variants treat their input identically before the +1 offset is applied.
        return librosa.power_to_db(np.abs(mel_spectrogram) + np.ones(mel_spectrogram.shape))
    elif feature == "padded_log_mel_grill":
        log_mel = get_log_mel_from_mel(mel_spectrogram, "log_mel_grill")
        # Subtracting the minimum makes the smallest entry equal to zero.
        return log_mel - np.amin(log_mel) * np.ones(log_mel.shape)
    elif feature == "minmax_log_mel_grill":
        padded_log_mel = get_log_mel_from_mel(mel_spectrogram, "padded_log_mel_grill")
        return np.divide(padded_log_mel, np.amax(padded_log_mel))
    elif feature == "mel" or feature == "log_mel":
        raise err.InvalidArgumentValueException("Invalid mel parameter, aren't you looking for mel_grill?")
    else:
        raise err.InvalidArgumentValueException(f"Unknown feature representation: {feature}.")
def get_audio_from_spectrogram(spectrogram, feature, hop_length, sr):
    """
    Reconstructs a playable audio object from a spectrogram.

    The spectrogram is brought back step by step to an STFT
    ("nn_log_mel_grill" -> "mel_grill" -> "stft"), and the STFT is then
    turned into a waveform with the Griffin-Lim algorithm of librosa.

    Parameters
    ----------
    spectrogram : numpy array
        Spectrogram to invert, expressed in the representation named by `feature`.
    feature : string
        Name of the representation of the spectrogram.
        Handled values are "stft", "mel_grill" and "nn_log_mel_grill".
    hop_length : int
        Hop length used when computing the spectrogram
        (or a similar value, for the reconstruction to make sense).
    sr : integer
        Sampling rate of the signal when it was processed into a spectrogram
        (or a similar value, for the reconstruction to make sense).

    Raises
    ------
    InvalidArgumentValueException
        If the feature representation is unknown.

    Returns
    -------
    ipd.Audio
        Audio object built from the reconstructed signal.

    """
    current, representation = spectrogram, feature

    if representation == "nn_log_mel_grill":
        # Undo the decibel conversion, then remove the +1 offset.
        current = librosa.db_to_power(current) - np.ones(current.shape)
        representation = "mel_grill"

    if representation == "mel_grill":
        current = librosa.feature.inverse.mel_to_stft(current, sr=sr, n_fft=2048, power=2.0, fmin=80.0, fmax=16000)
        representation = "stft"

    if representation == "stft":
        waveform = librosa.griffinlim(current, hop_length = hop_length)
        return ipd.Audio(waveform, rate=sr)

    raise err.InvalidArgumentValueException("Unknown feature representation, can't reconstruct a signal.")
# %% Implementation of PCP from MSAF (for baseline comparison)
def get_pcp_as_msaf(signal, sr, hop_length):
    """
    Computes Pitch-Class Profiles (PCP) following the MSAF implementation (baseline comparison).

    The harmonic part of the signal is extracted with HPSS, a hybrid CQT is computed on it,
    squared, and then folded into chroma features.

    Parameters
    ----------
    signal : numpy array
        Signal of the song.
    sr : float
        Sampling rate of the signal.
    hop_length : integer
        Hop length (in samples) between two frames.

    Returns
    -------
    pcp : numpy array
        Chroma features, with frames on the first axis (the librosa output is transposed).
    frame_times : numpy array
        Time (in seconds) associated with each frame of `pcp`.

    """
    audio_harmonic, _ = librosa.effects.hpss(y=signal)
    pcp_cqt = np.abs(librosa.hybrid_cqt(y=audio_harmonic,
                                        sr=sr,
                                        hop_length=hop_length,
                                        n_bins=84,
                                        norm=np.inf,
                                        fmin=27.5)) ** 2
    pcp = librosa.feature.chroma_cqt(C=pcp_cqt,
                                     sr=sr,
                                     hop_length=hop_length,
                                     n_octaves=6,
                                     fmin=27.5).T
    # Keyword arguments: recent librosa versions only accept `frames` positionally.
    frame_times = librosa.core.frames_to_time(np.arange(pcp.shape[0]), sr=sr, hop_length=hop_length)
    return pcp, frame_times
def get_beatsync_pcp_as_msaf(signal, sr, hop_length):
    """
    Computes beat-synchronous Pitch-Class Profiles (PCP) following the MSAF implementation.

    The PCP are computed on the harmonic part of the signal, beats are tracked
    on the percussive part, and PCP frames are then aggregated between beats.

    Parameters
    ----------
    signal : numpy array
        Signal of the song.
    sr : float
        Sampling rate of the signal.
    hop_length : integer
        Hop length (in samples) between two frames.

    Returns
    -------
    beatsync_feats : numpy array
        Beat-synchronous chroma features, with beats on the first axis.
    beatsync_times : numpy array
        Times (in seconds) associated with the rows of `beatsync_feats`.

    """
    audio_harmonic, audio_percussive = librosa.effects.hpss(y=signal)
    pcp_cqt = np.abs(librosa.hybrid_cqt(y=audio_harmonic,
                                        sr=sr,
                                        hop_length=hop_length,
                                        n_bins=84,
                                        norm=np.inf,
                                        fmin=27.5)) ** 2
    pcp = librosa.feature.chroma_cqt(C=pcp_cqt,
                                     sr=sr,
                                     hop_length=hop_length,
                                     n_octaves=6,
                                     fmin=27.5).T
    # Keyword arguments: recent librosa versions only accept `frames` positionally.
    frame_times = librosa.core.frames_to_time(np.arange(pcp.shape[0]), sr=sr, hop_length=hop_length)

    # Compute beats
    _, beat_frames = librosa.beat.beat_track(y=audio_percussive, sr=sr, hop_length=hop_length)

    # To times
    beat_times = librosa.frames_to_time(beat_frames, sr=sr, hop_length=hop_length)

    # TODO: Is this really necessary?
    if len(beat_times) > 0 and beat_times[0] == 0:
        beat_times = beat_times[1:]
        beat_frames = beat_frames[1:]

    # Make beat synchronous
    beatsync_feats = librosa.util.utils.sync(pcp.T, beat_frames, pad=True).T

    # Assign times (and add last time if padded)
    beatsync_times = np.copy(beat_times)
    if beatsync_times.shape[0] != beatsync_feats.shape[0]:
        beatsync_times = np.concatenate((beatsync_times,
                                         [frame_times[-1]]))
    return beatsync_feats, beatsync_times
\ No newline at end of file
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 25 16:54:59 2020
@author: amarmore
Computes spectrograms in different feature representations.
Note that Mel (and variants of Mel) spectrograms follow the particular definition of [1].
[1] Grill, T., & Schlüter, J. (2015, October).
Music Boundary Detection Using Neural Networks on Combined Features and Two-Level Annotations.
In ISMIR (pp. 531-537).
"""
import numpy as np
import librosa.core
import librosa.feature
import librosa.effects
from math import inf
import as_seg.model.errors as err
import IPython.display as ipd
# Exponent used for Mel spectrograms in this module: it is passed as the `power`
# argument both when computing a Mel spectrogram (compute_mel_spectrogram)
# and when inverting one back to an STFT (get_stft_from_mel).
mel_power = 2
# TODO: add MFCC, maybe tonnetz
def get_spectrogram(signal, sr, feature, hop_length, fmin = 98):
    """
    Returns a spectrogram, from the signal of a song.
    Different types of spectrogram can be computed, which are specified by the argument "feature".
    All these spectrograms are computed with the toolbox librosa [1].

    Parameters
    ----------
    signal : numpy array
        Signal of the song.
    sr : float
        Sampling rate of the signal, (typically 44100Hz).
    feature : String
        The type of spectrogram to compute (case-insensitive):
            - "pcp": chroma features, see compute_pcp,
            - "cqt": magnitude of the Constant-Q Transform,
            - "mel": Mel spectrogram, with the parameters of [2],
            - "log_mel", "nn_log_mel", "padded_log_mel", "minmax_log_mel":
              variants of the Mel spectrogram, see get_log_mel_from_mel,
            - "stft": magnitude of the Short-Time Fourier Transform,
            - "stft_complex": magnitude and phase of the Short-Time Fourier Transform.
    hop_length : integer
        The desired hop_length, which is the step between two frames (ie the time "discretization" step)
        It is expressed in terms of number of samples, which are defined by the sampling rate.
    fmin : integer, optional
        The minimal frequency to consider, used for denoising.
        Only used for the "pcp" feature.
        The default is 98.

    Raises
    ------
    InvalidArgumentValueException
        If the "feature" argument is not presented above.

    Returns
    -------
    numpy array
        Spectrogram of the signal.
        For "stft_complex", a tuple (magnitude, phase) is returned instead.

    References
    ----------
    [1] McFee, B., Raffel, C., Liang, D., Ellis, D. P., McVicar, M., Battenberg, E., & Nieto, O. (2015, July).
    librosa: Audio and music signal analysis in python.
    In Proceedings of the 14th python in science conference (Vol. 8).

    [2] Grill, T., & Schlüter, J. (2015, October).
    Music Boundary Detection Using Neural Networks on Combined Features and Two-Level Annotations.
    In ISMIR (pp. 531-537).

    """
    # Normalise the case once, so that every branch (including the Mel variants)
    # treats the feature name in the same case-insensitive way.
    feature_name = feature.lower()

    if feature_name == "pcp":
        return compute_pcp(signal, sr, hop_length, fmin)

    elif feature_name == "cqt":
        return compute_cqt(signal, sr, hop_length)

    # For Mel spectrograms, we use the same parameters as the ones of [2].
    elif feature_name == "mel":
        return compute_mel_spectrogram(signal, sr, hop_length)

    elif "mel" in feature_name:
        # Variant of the Mel spectrogram: compute the Mel spectrogram first, then transform it.
        mel_spectrogram = get_spectrogram(signal, sr, "mel", hop_length)
        return get_log_mel_from_mel(mel_spectrogram, feature_name)

    elif feature_name == "stft":
        return compute_stft(signal, sr, hop_length, complex = False)

    elif feature_name == "stft_complex":
        return compute_stft(signal, sr, hop_length, complex = True)

    else:
        raise err.InvalidArgumentValueException(f"Unknown signal representation: {feature}.")
def get_default_frequency_dimension(feature):
    """
    Returns the default size of the frequency dimension for a feature.

    Parameters
    ----------
    feature : string
        Name of the feature (case-insensitive).
        Any name containing "mel" is treated as a Mel-based feature.

    Raises
    ------
    InvalidArgumentValueException
        If the feature is unknown.

    Returns
    -------
    integer
        12 for "pcp", 84 for "cqt", 80 for Mel-based features,
        and 1025 for "stft" and "stft_complex".

    """
    name = feature.lower()

    # Features identified by their exact name.
    exact_sizes = {"pcp": 12, "cqt": 84}
    if name in exact_sizes:
        return exact_sizes[name]

    # All Mel variants share the same number of bands.
    if "mel" in name:
        return 80

    if name in ("stft", "stft_complex"):
        return 1025

    raise err.InvalidArgumentValueException(f"Unknown signal representation: {feature}.")
def compute_pcp(signal, sr, hop_length, fmin):
    """
    Computes chroma features of the signal with librosa's `chroma_cens`.

    Parameters
    ----------
    signal : numpy array
        Signal of the song.
    sr : float
        Sampling rate of the signal.
    hop_length : integer
        Hop length (in samples) between two frames.
    fmin : integer
        Minimal frequency to consider.

    Returns
    -------
    numpy array
        Output of librosa.feature.chroma_cens, computed with 12 chroma bins.

    """
    chroma_bins = 12
    subdivisions_per_chroma = 3
    return librosa.feature.chroma_cens(
        y=signal,
        sr=sr,
        hop_length=hop_length,
        fmin=fmin,
        n_chroma=chroma_bins,
        n_octaves=6,
        bins_per_octave=subdivisions_per_chroma * chroma_bins,
        norm=inf,  # Columns normalization
        win_len_smooth=82,  # Size of the smoothing window
    )
def compute_cqt(signal, sr, hop_length):
    """
    Computes the magnitude of the Constant-Q Transform of the signal.

    Parameters
    ----------
    signal : numpy array
        Signal of the song.
    sr : float
        Sampling rate of the signal.
    hop_length : integer
        Hop length (in samples) between two frames.

    Returns
    -------
    numpy array
        Absolute value of the output of librosa.cqt.

    """
    return np.abs(librosa.cqt(y=signal, sr = sr, hop_length = hop_length))
def compute_mel_spectrogram(signal, sr, hop_length):
    """
    Computes the Mel spectrogram of the signal.

    Parameters
    ----------
    signal : numpy array
        Signal of the song.
    sr : float
        Sampling rate of the signal.
    hop_length : integer
        Hop length (in samples) between two frames.

    Returns
    -------
    numpy array
        Absolute value of the Mel spectrogram, computed with 80 Mel bands,
        between 80Hz and 16kHz, with an FFT window of 2048 samples.

    """
    mel_settings = dict(
        n_fft=2048,
        hop_length=hop_length,
        n_mels=80,
        fmin=80.0,
        fmax=16000,
        power=mel_power,
    )
    return np.abs(librosa.feature.melspectrogram(y=signal, sr=sr, **mel_settings))
def get_log_mel_from_mel(mel_spectrogram, feature):
    """
    Transforms a Mel spectrogram into one of its variants (typically Log Mel).

    Parameters
    ----------
    mel_spectrogram : numpy array
        Mel spectrogram of the signal.
    feature : string
        Name of the desired variant:
            - "log_mel": decibel conversion of the magnitude of the Mel spectrogram,
            - "nn_log_mel": decibel conversion of (magnitude + 1),
            - "padded_log_mel": "log_mel" shifted so that its minimum is 0,
            - "minmax_log_mel": "padded_log_mel" divided by its maximum.

    Raises
    ------
    err.InvalidArgumentValueException
        Raised in case of unknown feature name.

    Returns
    -------
    numpy array
        Variant of the Mel spectrogram of the signal.

    """
    if feature == "log_mel":
        magnitude = np.abs(mel_spectrogram)
        return librosa.power_to_db(magnitude, ref=1)

    if feature == "nn_log_mel":
        # The +1 offset is applied to the magnitude, before the decibel conversion.
        shifted = np.abs(mel_spectrogram) + np.ones(mel_spectrogram.shape)
        return librosa.power_to_db(shifted, ref=1)

    if feature == "padded_log_mel":
        base = get_log_mel_from_mel(mel_spectrogram, "log_mel")
        lowest = np.amin(base)
        return base - lowest * np.ones(base.shape)

    if feature == "minmax_log_mel":
        padded = get_log_mel_from_mel(mel_spectrogram, "padded_log_mel")
        highest = np.amax(padded)
        return np.divide(padded, highest)

    raise err.InvalidArgumentValueException("Unknown feature representation.")
def compute_stft(signal, sr, hop_length, complex):
    """
    Computes the Short-Time Fourier Transform of the signal (FFT window of 2048 samples).

    Parameters
    ----------
    signal : numpy array
        Signal of the song.
    sr : float
        Sampling rate of the signal (not used in the computation).
    hop_length : integer
        Hop length (in samples) between two frames.
    complex : boolean
        If True, both the magnitude and the phase are returned.
        If False, only the magnitude is returned.

    Returns
    -------
    numpy array or tuple of numpy arrays
        Magnitude of the STFT, or the (magnitude, phase) pair if `complex` is True.

    """
    transform = librosa.stft(y=signal, hop_length=hop_length,n_fft=2048)
    if not complex:
        return np.abs(transform)
    magnitude, phase = librosa.magphase(transform, power = 1)
    return magnitude, phase
def get_stft_from_mel(mel_spectrogram, feature, sr):
    """
    Approximates an STFT from a Mel spectrogram, or from one of its log variants.

    Parameters
    ----------
    mel_spectrogram : numpy array
        Spectrogram to invert, expressed in the representation named by `feature`.
    feature : string
        Representation of the input: "mel", "log_mel" or "nn_log_mel".
    sr : float
        Sampling rate of the signal.

    Raises
    ------
    err.InvalidArgumentValueException
        Raised in case of unknown feature name.

    Returns
    -------
    numpy array
        Output of librosa.feature.inverse.mel_to_stft.

    """
    # First, bring the input back to a plain Mel spectrogram.
    if feature == "mel":
        mel = mel_spectrogram
    elif feature == "log_mel":
        mel = librosa.db_to_power(S_db=mel_spectrogram, ref=1)
    elif feature == "nn_log_mel":
        # Undo the decibel conversion, then remove the +1 offset.
        mel = librosa.db_to_power(S_db=mel_spectrogram, ref=1) - np.ones(mel_spectrogram.shape)
    else:
        raise err.InvalidArgumentValueException("Unknown feature representation.")

    # Then, invert the Mel filterbank.
    return librosa.feature.inverse.mel_to_stft(M=mel, sr=sr, n_fft=2048, power=mel_power, fmin=80.0, fmax=16000)
......@@ -30,5 +30,5 @@ path_data_persisted_salami = f"{path_parent_of_data}/data/annotations/salami" ##
path_entire_salami = "C:/Users/amarmore/Desktop/Audio samples/SALAMI" ## Path where are stored wav files of SALAMI (path where it is downloaded by mirdata also)
# Come Together
come_together = "C:/Users/amarmore/this_folder/The Beatles - Come Together"
#path_data_persisted_come_together = "C:/Users/amarmore/Desktop/data_persisted"
\ No newline at end of file
come_together = "/home/a23marmo/this_folder/The Beatles - Come Together"
path_data_persisted_come_together = "/home/a23marmo/Bureau/data_persisted/cometogether"
\ No newline at end of file