"""beatport_key Dataset Loader

.. admonition:: Dataset Info
    :class: dropdown

    The Beatport EDM Key Dataset includes 1486 two-minute sound excerpts from various EDM
    subgenres, annotated with single-key labels, comments and confidence levels generously provided by Eduard Mas Marín,
    and thoroughly revised and expanded by Ángel Faraldo.

    The original audio samples come from the online audio snippets provided by Beatport, an online music store for DJs
    and Electronic Dance Music producers (<https://www.beatport.com>). If this dataset is used in further research,
    we would appreciate a citation of the current DOI (10.5281/zenodo.1101082) and the following doctoral dissertation,
    where a detailed description of the properties of this dataset can be found:

    .. code-block:: latex

        Ángel Faraldo (2017). Tonality Estimation in Electronic Dance Music: A Computational and Musically Informed
        Examination. PhD Thesis. Universitat Pompeu Fabra, Barcelona.

    This dataset is mainly intended to assess the performance of computational key estimation algorithms in electronic
    dance music subgenres.

    Data License: Creative Commons Attribution Share Alike 4.0 International

"""

import csv
import os
import fnmatch
import json

from deprecated.sphinx import deprecated
import librosa
from smart_open import open

from mirdata import core, download_utils, jams_utils, io

BIBTEX = """@phdthesis {3897,
    title = {Tonality Estimation in Electronic Dance Music: A Computational and Musically Informed Examination},
    year = {2018},
    month = {03/2018},
    pages = {234},
    school = {Universitat Pompeu Fabra},
    address = {Barcelona},
    abstract = {This dissertation revolves around the task of computational key estimation in electronic dance music, upon which three interrelated operations are performed. First, I attempt to detect possible misconceptions within the task, which is typically accomplished with a tonal vocabulary overly centred in Western classical tonality, reduced to a binary major/minor model which might not accomodate popular music styles. Second, I present a study of tonal practises in electronic dance music, developed hand in hand with the curation of a corpus of over 2,000 audio excerpts, including various subgenres and degrees of complexity. Based on this corpus, I propose the creation of more open-ended key labels, accounting for other modal practises and ambivalent tonal configurations. Last, I describe my own key finding methods, adapting existing models to the musical idiosyncrasies and tonal distributions of electronic dance music, with new statistical key profiles derived from the newly created corpus.},
    keywords = {EDM, Electronic Dance Music, Key Estimation, mir, music information retrieval, tonality},
    url = {https://doi.org/10.5281/zenodo.1154586},
    author = {{\'A}ngel Faraldo}
}"""

INDEXES = {
    "default": "1.0.0",
    "test": "1.0.0",
    "1.0.0": core.Index(filename="beatport_key_index_1.0.0.json"),
}

REMOTES = {
    "keys": download_utils.RemoteFileMetadata(
        filename="keys.zip",
        url="https://zenodo.org/record/1101082/files/keys.zip?download=1",
        checksum="939abc05f36121badfac4087241ac172",
        destination_dir=".",
    ),
    "metadata": download_utils.RemoteFileMetadata(
        filename="original_metadata.zip",
        url="https://zenodo.org/record/1101082/files/original_metadata.zip?download=1",
        checksum="bb3e3ac1fe5dee7600ef2814accdf8f8",
        destination_dir=".",
    ),
    "audio": download_utils.RemoteFileMetadata(
        filename="audio.zip",
        url="https://zenodo.org/record/1101082/files/audio.zip?download=1",
        checksum="f490ee6c23578482d6fcfa11b82636a1",
        destination_dir=".",
    ),
}
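# The keys of REMOTES ("keys", "metadata", "audio") are the valid entries for the
# ``partial_download`` argument of ``Dataset.download`` defined further down. A
# hedged sketch, reusing the ``beatport`` object from the example above, that
# fetches only the key annotations and the original metadata:
#
#   beatport.download(partial_download=["keys", "metadata"])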

LICENSE_INFO = "Creative Commons Attribution Share Alike 4.0 International."


class Track(core.Track):
    """beatport_key track class

    Args:
        track_id (str): track id of the track
        data_home (str): Local path where the dataset is stored.

    Attributes:
        audio_path (str): track audio path
        keys_path (str): key annotation path
        metadata_path (str): metadata annotation path
        title (str): title of the track
        track_id (str): track id

    Cached Properties:
        key (list): list of annotated musical keys
        artists (list): artists involved in the track
        genres (dict): genres and sub-genres
        tempo (int): tempo in beats per minute

    """

    def __init__(self, track_id, data_home, dataset_name, index, metadata):
        super().__init__(track_id, data_home, dataset_name, index, metadata)

        self.keys_path = self.get_path("key")
        self.metadata_path = self.get_path("meta")
        self.audio_path = self.get_path("audio")

        self.title = self.audio_path.replace(".mp3", "").split("/")[-1]

    @core.cached_property
    def key(self):
        return load_key(self.keys_path)

    @core.cached_property
    def artists(self):
        return load_artist(self.metadata_path)

    @core.cached_property
    def genres(self):
        return load_genre(self.metadata_path)

    @core.cached_property
    def tempo(self):
        return load_tempo(self.metadata_path)

    @property
    def audio(self):
        """The track's audio

        Returns:
            * np.ndarray - audio signal
            * float - sample rate

        """
        return load_audio(self.audio_path)

    def to_jams(self):
        """Get the track's data in jams format

        Returns:
            jams.JAMS: the track's data in jams format

        """
        return jams_utils.jams_converter(
            audio_path=self.audio_path,
            metadata={
                "artists": self.artists,
                "genres": self.genres,
                "tempo": self.tempo,
                "title": self.title,
                "key": self.key,
            },
        )

# no decorator here because of https://github.com/librosa/librosa/issues/1267
def load_audio(fpath):
    """Load a beatport_key audio file.

    Args:
        fpath (str): path to an audio file

    Returns:
        * np.ndarray - the mono audio signal
        * float - the sample rate of the audio file

    """
    return librosa.load(fpath, sr=None, mono=True)

@io.coerce_to_string_io
def load_key(fhandle):
    """Load beatport_key format key data from a file

    Args:
        fhandle (str or file-like): path or file-like object pointing to a key annotation file

    Returns:
        list: list of annotated keys

    """
    reader = csv.reader(fhandle, delimiter="|")
    keys = next(reader)

    # standardize 'Unknown' to 'X'
    keys = ["x" if k.lower() == "unknown" else k for k in keys]
    return keys

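# Illustrative sketch only (the path and file content are hypothetical): a key
# annotation file holds a single "|"-separated line, so a file containing
# "Eb minor" yields ["Eb minor"], while a line reading "unknown" is standardized
# to ["x"] by load_key above.
#
#   keys = load_key("/path/to/some_key_annotation.txt")
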
@io.coerce_to_string_io
def load_tempo(fhandle):
    """Load beatport_key tempo data from a file

    Args:
        fhandle (str or file-like): path or file-like object pointing to metadata file

    Returns:
        str: tempo in beats per minute

    """
    return json.load(fhandle)["bpm"]

@io.coerce_to_string_io
def load_genre(fhandle):
    """Load beatport_key genre data from a file

    Args:
        fhandle (str or file-like): path or file-like object pointing to metadata file

    Returns:
        dict: dictionary with a list of genres under 'genres' and a list of sub-genres under 'sub_genres'

    """
    meta = json.load(fhandle)
    return {
        "genres": [genre["name"] for genre in meta["genres"]],
        "sub_genres": [genre["name"] for genre in meta["sub_genres"]],
    }

@io.coerce_to_string_io
def load_artist(fhandle):
    """Load beatport_key artist data from a file

    Args:
        fhandle (str or file-like): path or file-like object pointing to metadata file

    Returns:
        list: list of artists involved in the track

    """
    meta = json.load(fhandle)
    return [artist["name"] for artist in meta["artists"]]

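# Taken together, load_tempo, load_genre and load_artist above assume an original
# Beatport metadata JSON shaped roughly like the following (all field values are
# purely illustrative):
#
#   {"artists": [{"name": "Some Artist"}],
#    "genres": [{"name": "Techno"}],
#    "sub_genres": [{"name": "Dub Techno"}],
#    "bpm": 128}
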
@core.docstring_inherit(core.Dataset)
class Dataset(core.Dataset):
    """
    The beatport_key dataset
    """

    def __init__(self, data_home=None, version="default"):
        super().__init__(
            data_home,
            version,
            name="beatport_key",
            track_class=Track,
            bibtex=BIBTEX,
            indexes=INDEXES,
            remotes=REMOTES,
            license_info=LICENSE_INFO,
        )

[docs] @deprecated(reason="Use mirdata.datasets.beatport_key.load_audio", version="0.3.4") def load_audio(self, *args, **kwargs): return load_audio(*args, **kwargs)
[docs] @deprecated(reason="Use mirdata.datasets.beatport_key.load_key", version="0.3.4") def load_key(self, *args, **kwargs): return load_key(*args, **kwargs)
[docs] @deprecated(reason="Use mirdata.datasets.beatport_key.load_tempo", version="0.3.4") def load_tempo(self, *args, **kwargs): return load_tempo(*args, **kwargs)
[docs] @deprecated(reason="Use mirdata.datasets.beatport_key.load_genre", version="0.3.4") def load_genre(self, *args, **kwargs): return load_genre(*args, **kwargs)
[docs] @deprecated(reason="Use mirdata.datasets.beatport_key.load_artist", version="0.3.4") def load_artist(self, *args, **kwargs): return load_artist(*args, **kwargs)
    def download(self, partial_download=None, force_overwrite=False, cleanup=False):
        """Download the dataset

        Args:
            partial_download (list or None):
                A list of keys of remotes to partially download.
                If None, all data is downloaded.
            force_overwrite (bool):
                If True, existing files are overwritten by the downloaded files.
            cleanup (bool):
                Whether to delete any zip/tar files after extracting.

        Raises:
            ValueError: if invalid keys are passed to partial_download
            IOError: if a downloaded file's checksum is different from expected

        """
        download_utils.downloader(
            self.data_home,
            remotes=self.remotes,
            index=self._index_data,
            partial_download=partial_download,
            force_overwrite=force_overwrite,
            cleanup=cleanup,
        )
        # the original metadata files contain literal "nan" values, which are not
        # valid JSON; rewrite them as "null" so that json.load can parse the files
        self._find_replace(
            os.path.join(self.data_home, "meta"), ": nan", ": null", "*.json"
        )

    def _find_replace(self, directory, find, replace, pattern):
        """Replace all occurrences of "find" with "replace" in files matching "pattern"

        Args:
            directory (str): path to the directory to search
            find (str): string to find
            replace (str): string to replace it with
            pattern (str): glob pattern that file names must match

        """
        for path, dirs, files in os.walk(os.path.abspath(directory)):
            for filename in fnmatch.filter(files, pattern):
                filepath = os.path.join(path, filename)
                with open(filepath) as f:
                    s = f.read()
                s = s.replace(find, replace)
                with open(filepath, "w") as f:
                    f.write(s)