Source code for mirdata.maestro

# -*- coding: utf-8 -*-
"""MAESTRO Dataset Loader

MAESTRO (MIDI and Audio Edited for Synchronous TRacks and Organization) is a
dataset composed of over 200 hours of virtuosic piano performances captured
with fine alignment (~3 ms) between note labels and audio waveforms.

The dataset is created and released by Google's Magenta team.

The dataset contains over 200 hours of paired audio and MIDI recordings from
ten years of International Piano-e-Competition. The MIDI data includes key
strike velocities and sustain/sostenuto/una corda pedal positions. Audio and
MIDI files are aligned with ∼3 ms accuracy and sliced to individual musical
pieces, which are annotated with composer, title, and year of performance.
Uncompressed audio is of CD quality or higher (44.1–48 kHz 16-bit PCM stereo).

A train/validation/test split configuration is also proposed, so that the same
composition, even if performed by multiple contestants, does not appear in
multiple subsets. Repertoire is mostly classical, including composers from the
17th to early 20th century.

The dataset is made available by Google LLC under a Creative Commons
Attribution Non-Commercial Share-Alike 4.0 (CC BY-NC-SA 4.0) license.

This loader supports MAESTRO version 2.

For more details, please visit: https://magenta.tensorflow.org/datasets/maestro
"""

import json
import glob
import logging
import os
import shutil

import librosa
import numpy as np
import pretty_midi

from mirdata import download_utils
from mirdata import jams_utils
from mirdata import track
from mirdata import utils

DATASET_DIR = 'MAESTRO'

REMOTES = {
    'all': download_utils.RemoteFileMetadata(
        filename='maestro-v2.0.0.zip',
        url='https://storage.googleapis.com/magentadata/datasets/maestro/v2.0.0/maestro-v2.0.0.zip',
        checksum='7a6c23536ebcf3f50b1f00ac253886a7',
        destination_dir='',
    ),
    'midi': download_utils.RemoteFileMetadata(
        filename='maestro-v2.0.0-midi.zip',
        url='https://storage.googleapis.com/magentadata/datasets/maestro/v2.0.0/maestro-v2.0.0-midi.zip',
        checksum='8a45cc678a8b23cd7bad048b1e9034c5',
        destination_dir='',
    ),
    'metadata': download_utils.RemoteFileMetadata(
        filename='maestro-v2.0.0.json',
        url='https://storage.googleapis.com/magentadata/datasets/maestro/v2.0.0/maestro-v2.0.0.json',
        checksum='576172af1cdc4efddcf0be7d260d48f7',
        destination_dir='maestro-v2.0.0',
    ),
}


def _load_metadata(data_home):
    metadata_path = os.path.join(data_home, 'maestro-v2.0.0.json')
    if not os.path.exists(metadata_path):
        logging.info("Metadata file {} not found.".format(metadata_path))
        return None

    # load metadata however makes sense for your dataset
    with open(metadata_path, 'r') as fhandle:
        raw_metadata = json.load(fhandle)

    metadata = {}
    for mdata in raw_metadata:
        track_id = mdata['midi_filename'].split('.')[0]
        metadata[track_id] = mdata

    metadata['data_home'] = data_home

    return metadata


DATA = utils.LargeData('maestro_index.json', _load_metadata)


[docs]class Track(track.Track):
    """MAESTRO Track class

    Args:
        track_id (str): track id of the track
        data_home (str): Local path where the dataset is stored. default=None
            If `None`, looks for the data in the default directory, `~/mir_datasets`

    Attributes:
        audio_path (str): Path to the track's audio file
        canonical_composer (str): Composer of the piece, standardized on a
            single spelling for a given name.
        canonical_title (str): Title of the piece. Not guaranteed to be
            standardized to a single representation.
        duration (float): Duration in seconds, based on the MIDI file.
        midi_path (str): Path to the track's MIDI file
        split (str): Suggested train/validation/test split.
        track_id (str): track id
        year (int): Year of performance.

    """

    def __init__(self, track_id, data_home=None):
        if track_id not in DATA.index:
            raise ValueError('{} is not a valid track ID in MAESTRO'.format(track_id))

        self.track_id = track_id

        if data_home is None:
            data_home = utils.get_default_dataset_path(DATASET_DIR)

        self._data_home = data_home
        self._track_paths = DATA.index[track_id]

        self.audio_path = os.path.join(self._data_home, self._track_paths['audio'][0])
        self.midi_path = os.path.join(self._data_home, self._track_paths['midi'][0])

        self._metadata = DATA.metadata(data_home)
        if self._metadata is not None and track_id in self._metadata:
            self.canonical_composer = self._metadata[track_id]['canonical_composer']
            self.canonical_title = self._metadata[track_id]['canonical_title']
            self.split = self._metadata[track_id]['split']
            self.year = self._metadata[track_id]['year']
            self.duration = self._metadata[track_id]['duration']
        else:
            self.canonical_composer = None
            self.canonical_title = None
            self.split = None
            self.year = None
            self.duration = None

    @utils.cached_property
    def midi(self):
        """output type: description of output"""
        return load_midi(self.midi_path)

    @utils.cached_property
    def notes(self):
        """NoteData: annotated piano notes"""
        return load_notes(self.midi_path, self.midi)

    @property
    def audio(self):
        """(np.ndarray, float): track's audio signal, sample rate"""
        return load_audio(self.audio_path)

[docs]    def to_jams(self):
        """Jams: the track's data in jams format"""
        return jams_utils.jams_converter(
            audio_path=self.audio_path,
            note_data=[(self.notes, None)],
            metadata=self._metadata,
        )


[docs]def load_midi(midi_path):
    """Load a MAESTRO midi file.

    Args:
        midi_path (str): path to midi file

    Returns:
        midi_data (obj): pretty_midi object

    """
    if not os.path.exists(midi_path):
        raise IOError("midi_path {} does not exist".format(midi_path))

    return pretty_midi.PrettyMIDI(midi_path)


[docs]def load_notes(midi_path, midi=None):
    """Load note data from the midi file.

    Args:
        midi_path (str): path to midi file
        midi (pretty_midi.PrettyMIDI): pre-loaded midi object or None
            if None, the midi object is loaded using midi_path

    Returns:
        note_data (NoteData)

    """
    if midi is None:
        midi = load_midi(midi_path)

    intervals = []
    pitches = []
    confidence = []
    for note in midi.instruments[0].notes:
        intervals.append([note.start, note.end])
        pitches.append(librosa.midi_to_hz(note.pitch))
        confidence.append(note.velocity)
    return utils.NoteData(np.array(intervals), np.array(pitches), np.array(confidence))


[docs]def load_audio(audio_path):
    """Load a MAESTRO audio file.

    Args:
        audio_path (str): path to audio file

    Returns:
        y (np.ndarray): the mono audio signal
        sr (float): The sample rate of the audio file

    """
    if not os.path.exists(audio_path):
        raise IOError("audio_path {} does not exist".format(audio_path))
    return librosa.load(audio_path, sr=None, mono=True)


[docs]def download(
    data_home=None, partial_download=None, force_overwrite=False, cleanup=True
):
    """Download the dataset.

    Args:
        data_home (str):
            Local path where the dataset is stored.
            If `None`, looks for the data in the default directory, `~/mir_datasets`
        force_overwrite (bool):
            Whether to overwrite the existing downloaded data
        partial_download (list):
            List indicating what to partially download. The list can include any of:
                * 'all': audio, midi and metadata
                * 'midi': midi and metadata only
                * 'metadata': metadata only
            If `None`, all data is downloaded.
        cleanup (bool):
            Whether to delete the zip/tar file after extracting.

    """
    if data_home is None:
        data_home = utils.get_default_dataset_path(DATASET_DIR)

    # in MAESTRO "metadata" is contained in "midi" is contained in "all"
    if partial_download is None or 'all' in partial_download:
        partial_download = ['all']
    elif 'midi' in partial_download:
        partial_download = ['midi']

    download_utils.downloader(
        data_home,
        remotes=REMOTES,
        partial_download=partial_download,
        force_overwrite=force_overwrite,
        cleanup=cleanup,
    )

    # files get downloaded to a folder called maestro-v2.0.0
    # move everything up a level
    maestro_dir = os.path.join(data_home, 'maestro-v2.0.0')
    maestro_files = glob.glob(os.path.join(maestro_dir, '*'))

    for fpath in maestro_files:
        shutil.move(fpath, data_home)

    if os.path.exists(maestro_dir):
        shutil.rmtree(maestro_dir)


[docs]def validate(data_home=None, silence=False):
    """Validate if the stored dataset is a valid version

    Args:
        data_home (str): Local path where the dataset is stored.
            If `None`, looks for the data in the default directory, `~/mir_datasets`
    Returns:
        missing_files (list): List of file paths that are in the dataset index
            but missing locally
        invalid_checksums (list): List of file paths that file exists in the dataset
            index but has a different checksum compare to the reference checksum
    """
    if data_home is None:
        data_home = utils.get_default_dataset_path(DATASET_DIR)

    missing_files, invalid_checksums = utils.validator(
        DATA.index, data_home, silence=silence
    )
    return missing_files, invalid_checksums


[docs]def track_ids():
    """Return track ids

    Returns:
        (list): A list of track ids
    """
    return list(DATA.index.keys())


[docs]def load(data_home=None):
    """Load MAESTRO dataset

    Args:
        data_home (str): Local path where the dataset is stored.
            If `None`, looks for the data in the default directory, `~/mir_datasets`
    Returns:
        (dict): {`track_id`: track data}
    """
    if data_home is None:
        data_home = utils.get_default_dataset_path(DATASET_DIR)

    data = {}
    for key in DATA.index.keys():
        data[key] = Track(key, data_home=data_home)
    return data


[docs]def cite():
    """Print the reference"""

    cite_data = """
=========== MLA ===========
Curtis Hawthorne, Andriy Stasyuk, Adam Roberts, Ian Simon, Cheng-Zhi Anna Huang,
  Sander Dieleman, Erich Elsen, Jesse Engel, and Douglas Eck. "Enabling
  Factorized Piano Music Modeling and Generation with the MAESTRO Dataset."
  In International Conference on Learning Representations, 2019.
========== Bibtex ==========
@inproceedings{
  hawthorne2018enabling,
  title={Enabling Factorized Piano Music Modeling and Generation with the {MAESTRO} Dataset},
  author={Curtis Hawthorne and Andriy Stasyuk and Adam Roberts and Ian Simon and Cheng-Zhi Anna Huang and Sander Dieleman and Erich Elsen and Jesse Engel and Douglas Eck},
  booktitle={International Conference on Learning Representations},
  year={2019},
  url={https://openreview.net/forum?id=r1lYRjC9F7},
}
"""
    print(cite_data)