"""
IRMAS Loader

.. admonition:: Dataset Info
    :class: dropdown

    IRMAS: a dataset for instrument recognition in musical audio signals

    This dataset includes musical audio excerpts with annotations of the predominant instrument(s) present.
    It was used for the evaluation in the following article:

    .. code-block:: latex

        Bosch, J. J., Janer, J., Fuhrmann, F., & Herrera, P. “A Comparison of Sound Segregation Techniques for
        Predominant Instrument Recognition in Musical Audio Signals”, in Proc. ISMIR (pp. 559-564), 2012.

    IRMAS is intended to be used for training and testing methods for the automatic recognition of predominant
    instruments in musical audio. The instruments considered are: cello, clarinet, flute, acoustic guitar,
    electric guitar, organ, piano, saxophone, trumpet, violin, and human singing voice.
    This dataset is derived from the one compiled by Ferdinand Fuhrmann in his PhD thesis, with the differences
    that we provide audio data in stereo format, the annotations in the testing dataset are limited to specific
    pitched instruments, and the number and length of excerpts differ from the original dataset.


    The dataset is split into training and test data.

    **Training data**

    Total audio samples: 6705
    They are 3-second excerpts from more than 2000 distinct recordings.

    Audio specifications

    * Sampling frequency: 44.1 kHz
    * Bit-depth: 16 bit
    * Audio format: .wav

    IRMAS Dataset training samples are annotated by encoding the information of each track in its filename
    (a parsing sketch follows the list below).

    * Predominant instrument:

        * The annotation of the predominant instrument of each excerpt is both in the name of the containing
          folder, and in the file name: cello (cel), clarinet (cla), flute (flu), acoustic guitar (gac),
          electric guitar (gel), organ (org), piano (pia), saxophone (sax), trumpet (tru), violin (vio),
          and human singing voice (voi).
        * The number of files per instrument is: cel(388), cla(505), flu(451), gac(637), gel(760), org(682),
          pia(721), sax(626), tru(577), vio(580), voi(778).

    * Drum presence

        * Additionally, some of the files are annotated in the filename regarding the presence ([dru])
          or absence ([nod]) of drums.

    * The annotation of the musical genre:

        * country-folk ([cou_fol])
        * classical ([cla])
        * pop-rock ([pop_roc])
        * latin-soul ([lat_sou])
        * jazz-blues ([jaz_blu])
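
    As a rough illustration, a hypothetical training filename such as
    ``[gel][dru][pop_roc]0589__1.wav`` could be parsed like this (real files
    may omit the drum field):

    .. code-block:: python

        def parse_train_filename(name):
            # "[gel][dru][pop_roc]0589__1.wav" -> ("gel", True, "pop_roc")
            fields = [f.split("]")[0] for f in name.split("[")[1:]]
            instrument = fields[0]
            drum = fields[1] == "dru" if fields[1] in ("dru", "nod") else None
            genre = fields[-1]
            return instrument, drum, genre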

    **Testing data**

    Total audio samples: 2874

    Audio specifications

    * Sampling frequency: 44.1 kHz
    * Bit-depth: 16 bit
    * Audio format: .wav

    IRMAS Dataset testing samples are annotated as follows:

    * Predominant instrument:

        The annotations for an excerpt named: “excerptName.wav” are given in “excerptName.txt”. More than one
        instrument may be annotated in each excerpt, one label per line. This part of the dataset contains excerpts
        from a diversity of western musical genres, with varied instrumentations, and it is derived from the original
        testing dataset from Fuhrmann (http://www.dtic.upf.edu/~ffuhrmann/PhD/).
        Instrument nomenclature is the same as in the training dataset; a minimal reading sketch follows.
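
        As a sketch only, the labels for an excerpt could be read as:

        .. code-block:: python

            with open("excerptName.txt") as fhandle:
                labels = [line.strip() for line in fhandle if line.strip()]
            # e.g. labels == ["gel", "voi"]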

    Dataset compiled by Juan J. Bosch, Ferdinand Fuhrmann, Perfecto Herrera,
    Music Technology Group - Universitat Pompeu Fabra (Barcelona).

    The IRMAS dataset is offered free of charge for non-commercial use only. You may not redistribute or modify it.
    This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License.

    For more details, please visit: https://www.upf.edu/web/mtg/irmas

"""

import csv
import os
from typing import BinaryIO, List, Optional, TextIO, Tuple

from deprecated.sphinx import deprecated
import librosa
import numpy as np

from mirdata import core, download_utils, io, jams_utils

BIBTEX = """
@dataset{juan_j_bosch_2014_1290750,
  author       = {Juan J. Bosch and Ferdinand Fuhrmann and Perfecto Herrera},
  title        = {{IRMAS: a dataset for instrument recognition in musical audio signals}},
  month        = sep,
  year         = 2014,
  publisher    = {Zenodo},
  version      = {1.0},
  doi          = {10.5281/zenodo.1290750},
  url          = {https://doi.org/10.5281/zenodo.1290750}
}
"""

INDEXES = {
    "default": "1.0",
    "test": "1.0",
    "1.0": core.Index(filename="irmas_index_1.0.json"),
}

REMOTES = {
    "training_data": download_utils.RemoteFileMetadata(
        filename="IRMAS-TrainingData.zip",
        url="https://zenodo.org/record/1290750/files/IRMAS-TrainingData.zip?download=1",
        checksum="4fd9f5ed5a18d8e2687e6360b5f60afe",
    ),
    "testing_data_1": download_utils.RemoteFileMetadata(
        filename="IRMAS-TestingData-Part1.zip",
        url="https://zenodo.org/record/1290750/files/IRMAS-TestingData-Part1.zip?download=1",
        checksum="5a2e65520dcedada565dff2050bb2a56",
    ),
    "testing_data_2": download_utils.RemoteFileMetadata(
        filename="IRMAS-TestingData-Part2.zip",
        url="https://zenodo.org/record/1290750/files/IRMAS-TestingData-Part2.zip?download=1",
        checksum="afb0c8ea92f34ee653693106be95c895",
    ),
    "testing_data_3": download_utils.RemoteFileMetadata(
        filename="IRMAS-TestingData-Part3.zip",
        url="https://zenodo.org/record/1290750/files/IRMAS-TestingData-Part3.zip?download=1",
        checksum="9b3fb2d0c89cdc98037121c25bd5b556",
    ),
}


INST_DICT = [
    "cel",
    "cla",
    "flu",
    "gac",
    "gel",
    "org",
    "pia",
    "sax",
    "tru",
    "vio",
    "voi",
]

GENRE_DICT = ["cou_fol", "cla", "pop_roc", "lat_sou", "jaz_blu"]

LICENSE_INFO = (
    "Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License."
)


class Track(core.Track):
    """IRMAS track class

    Args:
        track_id (str): track id of the track
        data_home (str): Local path where the dataset is stored. If `None`,
            looks for the data in the default directory, `~/mir_datasets/irmas`

    Attributes:
        track_id (str): track id
        predominant_instrument (str): predominant instrument of a training track (None for test tracks)
        train (bool): flag to identify if the track is from the training or the testing dataset
        genre (str): string containing the namecode of the genre of the track.
        drum (bool): flag to identify if the track contains drums or not.
        split (str): data split ("train" or "test")

    Cached Properties:
        instrument (list): list of predominant instruments as str

    """

    def __init__(self, track_id, data_home, dataset_name, index, metadata):
        super().__init__(track_id, data_home, dataset_name, index, metadata)

        self.annotation_path = self.get_path("annotation")
        self.audio_path = self.get_path("audio")
        self._audio_filename = self._track_paths["audio"][0]

        # training track ids contain a double underscore; testing ids do not
        self.split = "train" if "__" in track_id else "test"

        # Dataset attributes
        self.predominant_instrument = (
            os.path.basename(os.path.dirname(self.audio_path))
            if self.split == "train"
            else None
        )
        if self.split == "train" and (
            "dru" in self._audio_filename or "nod" in self._audio_filename
        ):
            # filename pattern with drum annotation: [inst][dru|nod][genre]...
            self.genre = self._audio_filename.split(".")[0].split("[")[3].split("]")[0]
            self.drum = "dru" in self._audio_filename
        elif self.split == "train":
            # filename pattern without drum annotation: [inst][genre]...
            self.genre = self._audio_filename.split(".")[0].split("[")[2].split("]")[0]
            self.drum = None
        else:
            self.genre = None
            self.drum = None

        self.train = self.split == "train"

    @core.cached_property
    def instrument(self):
        if self.predominant_instrument is not None:
            return [self.predominant_instrument]
        return load_pred_inst(self.annotation_path)

    @property
    def audio(self) -> Optional[Tuple[np.ndarray, float]]:
        """The track's audio signal

        Returns:
            * np.ndarray - the stereo audio signal
            * float - The sample rate of the audio file

        """
        return load_audio(self.audio_path)

    def to_jams(self):
        """Get the track's data in jams format

        Returns:
            jams.JAMS: return track data in jams format

        """
        return jams_utils.jams_converter(
            audio_path=self.audio_path,
            metadata={
                "instrument": self.instrument,
                "genre": self.genre,
                "drum": self.drum,
                "train": self.train,
            },
        )


@io.coerce_to_bytes_io
def load_audio(fhandle: BinaryIO) -> Tuple[np.ndarray, float]:
    """Load an IRMAS dataset audio file.

    Args:
        fhandle (str or file-like): File-like object or path to audio file

    Returns:
        * np.ndarray - the stereo audio signal
        * float - The sample rate of the audio file

    """
    return librosa.load(fhandle, sr=44100, mono=False)


@io.coerce_to_string_io
def load_pred_inst(fhandle: TextIO) -> List[str]:
    """Load predominant instrument of track

    Args:
        fhandle (str or file-like): File-like object or path where the test annotations are stored

    Returns:
        list(str): test track predominant instrument(s) annotations

    """
    pred_inst = []
    reader = csv.reader(fhandle, delimiter=" ")
    for line in reader:
        inst_code = line[0][:3]
        assert (
            inst_code in INST_DICT
        ), "Instrument {} not in instrument dictionary".format(inst_code)
        pred_inst.append(inst_code)
    return pred_inst


@core.docstring_inherit(core.Dataset)
class Dataset(core.Dataset):
    """
    The irmas dataset
    """

    def __init__(self, data_home=None, version="default"):
        super().__init__(
            data_home,
            version,
            name="irmas",
            track_class=Track,
            bibtex=BIBTEX,
            indexes=INDEXES,
            remotes=REMOTES,
            license_info=LICENSE_INFO,
        )

    @deprecated(reason="Use mirdata.datasets.irmas.load_audio", version="0.3.4")
    def load_audio(self, *args, **kwargs):
        return load_audio(*args, **kwargs)

    @deprecated(
        reason="Use mirdata.datasets.irmas.load_pred_inst", version="0.3.4"
    )
    def load_pred_inst(self, *args, **kwargs):
        return load_pred_inst(*args, **kwargs)
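
# A hedged usage sketch (not part of the loader): ``mirdata.initialize`` and
# the ``download``/``choice_track`` methods below come from mirdata's public
# Dataset API.
#
#     import mirdata
#
#     irmas = mirdata.initialize("irmas")
#     irmas.download()  # fetches the REMOTES defined above
#     track = irmas.choice_track()
#     print(track.instrument, track.genre, track.drum)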