Source code for mirdata.datasets.filosax

"""Filosax Dataset Loader

.. admonition:: Dataset Info
    :class: dropdown

    The Filosax dataset was conceived, curated and compiled by Dave Foster (a PhD student on the AIM programme at QMUL) and his supervisor Simon Dixon (C4DM @ QMUL).
    The dataset is a collection of 48 multitrack jazz recordings, where each piece has 8 corresponding audio files:
    
    1) The original Aebersold backing track (stereo)
    2) Bass_Drums, a mono file of a mix of bass and drums
    3) Piano_Drums, a mono file of a mix of piano and drums
    4) Participant 1 Sax, a mono file of solo saxophone
    5) Participant 2 Sax, a mono file of solo saxophone
    6) Participant 3 Sax, a mono file of solo saxophone
    7) Participant 4 Sax, a mono file of solo saxophone
    8) Participant 5 Sax, a mono file of solo saxophone
    
    Each piece is ~6 minutes long, so each of the 8 stems contains ~5 hours of audio.
    
    For each piece, there is a corresponding .jams file containing piece-level annotations:
    
    1) Beat annotation for the start of each bar and any mid-bar chord change
    2) Chord annotation for each bar, and mid-bar chord change
    3) Section annotation for when the solo changes between the 3 categories:
        a) head (melody)
        b) written solo (interpretation of transcribed solo)
        c) improvised solo
        
    For each Sax recording (5 per piece), there is a corresponding .json file containing note annotations (see Note object).
    
    The Participant folders also contain MIDI files of the transcriptions (frame level and score level) as well as a PDF and MusicXML of the typeset solo.
    
    The dataset comes in 2 flavours: full (all 48 tracks and 5 sax players) and lite (5 tracks and 2 sax players).
    Both flavours can be used with or without the backing tracks (which need to be purchased online).
    Hence, when opening the dataset, use one of 4 versions: 'full', 'full_sax', 'lite', 'lite_sax'.

"""
import csv
import json
import os
import jams
from typing import BinaryIO, Dict, Optional, TextIO, Tuple, List

import librosa
import numpy as np
from smart_open import open

from mirdata import download_utils, jams_utils, core, annotations, io

# BibTeX citation for the Filosax paper; handed to core.Dataset as `bibtex`
# by the Dataset class in this module.
BIBTEX = """
@inproceedings{
  foster_filosax_2021,
  title={Filosax: A Dataset of Annotated Jazz Saxophone Recordings},
  author={Foster, Dave and Dixon, Simon},
  booktitle={International Society for Music Information Retrieval (ISMIR) Conference},
  year={2021}
}
"""

# Version table handed to core.Dataset as `indexes`.
# String values name another key of this dict (an alias for a concrete
# version); core.Index values identify the index file for that version.
INDEXES = {
    "default": "full_1.0",
    "full": "full_1.0",
    "full_sax": "full_sax_1.0",
    "lite": "lite_1.0",
    "lite_sax": "lite_sax_1.0",
    "full_1.0": core.Index(filename="filosax_index_full_1.0.json"),
    "full_sax_1.0": core.Index(filename="filosax_index_full_sax_1.0.json"),
    "lite_1.0": core.Index(filename="filosax_index_lite_1.0.json"),
    "lite_sax_1.0": core.Index(filename="filosax_index_lite_sax_1.0.json"),
    # "test" reuses the lite index file. The former `"test": "test"` entry was
    # a duplicate key that this entry silently overwrote, so it was removed;
    # the resulting mapping is unchanged.
    "test": core.Index(filename="filosax_index_lite_1.0.json"),
}

# Manual download/setup instructions; handed to core.Dataset as
# `download_info` by the Dataset class in this module.
DOWNLOAD_INFO = """
To download the dataset, first go to the Zenodo pages below to request access:

(Full - 14.5GB)
https://zenodo.org/record/5643843#.YYL7aS2l3UI

(Lite - 558MB)
https://zenodo.org/record/5643734#.YYLQ-i2l3UI

Unzip the downloaded file to the folder /Users/<username>/mir_datasets/ (or wherever data_home has been assigned on initialization), and remove the version number from the folder:

(Full)
/Users/<username>/mir_datasets/Filosax

(Lite)
/Users/<username>/mir_datasets/Filosax_Lite

This data is sufficient to use the dataset in the "_sax" (sax only) mode. To download the backing data, go to the Aebersold sites:

(Full)
https://www.jazzbooks.com/mm5/merchant.mvc?&Screen=WISH&Store_Code=JAJAZZ&WishList_ID=1679

(Lite)
https://www.jazzbooks.com/mm5/merchant.mvc?&Screen=WISH&Store_Code=JAJAZZ&WishList_ID=1678

Put the files downloaded into the "/Aebersold" folder, and then run the appropriate script from inside the home folder:

(Full)
python Scripts/Compile_Backing.py -version full

(Lite)
python Scripts/Compile_Backing.py -version lite

which populates the "/Backing" folder with edited files, which match the versions that were used in the recordings.

"""

# Conditions of use for the dataset; handed to core.Dataset as `license_info`
# by the Dataset class in this module. The wording is kept verbatim.
LICENSE_INFO = """
The Filosax dataset contains copyright material and is shared with researchers under the following conditions:
1. Filosax may only be used by the individual signing below and by members of the research group or organisation of this individual. This permission is not transferable.
2. Filosax may be used only for non-commercial research purposes.
3. Filosax (or data enabling the its reproduction) may not be sold, leased, published or distributed to any third party without written permission from the Filosax administrator.
4. When research results obtained using Filosax are publicly released (in the form of reports, publications, or derivative software), clear indication of the use of Filosax shall be given, usually in the form of a citation of the following paper:
    D. Foster and S. Dixon (2021),  Filosax: A Dataset of Annotated Jazz Saxophone Recordings.
    22nd International Society for Music Information Retrieval Conference (ISMIR).
5. Queen Mary University of London shall not be held liable for any errors in the content of Filosax nor damage arising from the use of Filosax.
6. The Filosax administrator may update these conditions of use at any time. 
"""


class Note:
    """Filosax Note class - dictionary wrapper to give dot properties

    Every attribute is read from the key of the same name in ``input_dict``.
    A key that is absent falls back to the default hard-coded in ``__init__``
    (so ``Note({})`` is a valid, all-default note). ``s_end_time`` is the one
    derived value: it is computed as ``s_start_time + s_duration``.

    Args:
        input_dict (dict): dictionary of attributes

    Attributes:
        a_start_time (float): the time stamp of the note start, in seconds
        a_end_time (float): the time stamp of the note end, in seconds
        a_duration (float): the duration of the note, in seconds
        a_onset_time (float): the onset time (compared to a_start_time) (filosax_full only, 0.0 otherwise)
        midi_pitch (int): the quantised midi pitch
        crochet_num (int): the number of sub-divisions which define a crochet (always 24)
        musician (int): the participant ID
        bar_num (int): the bar number of the start of the note
        s_start_time (float): the time stamp of the score note start, in seconds
        s_duration (float): the duration of the score note, in seconds
        s_end_time (float): the time stamp of the score note end, in seconds
        s_rhythmic_duration (int): the duration of the score note (compared to crochet_num)
        s_rhythmic_position (int): the position in the bar of the score note start (compared to crochet_num)
        tempo (float): the tempo at the start of the note, in beats per minute
        bar_type (int): the section annotation where 0 = head, 1 = written solo, 2 = improvised solo
        is_grace (bool): is the note a grace note, associated with the following note
        chord_changes (dict): the chords, where the key is the rhythmic position of the chord (using crochet_num, relative to s_rhythmic_position) and the value a JAMS chord annotation (an additional chord is added in the case of a quaver at the end of the bar, followed by a rest on the downbeat)
        num_chord_changes (int): the number of chords which accompany the note (usually 1, sometimes >1 for long notes)
        main_chord_num (int): usually 0, sometimes 1 in the quaver case described above
        scale_changes (list, int): the degree of the chromatic scale when midi_pitch is compared to chord_root
        loudness_max_val (float): the value (db) of the maximum loudness
        loudness_max_time (float): the time (seconds) of the maximum loudness (compared to a_start_time)
        loudness_curve (list, float): the inter-note loudness values, 1 per millisecond
        pitch_average_val (float): the value (midi) of the average pitch
        pitch_average_time (float): the time (seconds) of the average pitch (compared to a_start_time)
        pitch_curve (list, float): the inter-note pitch values, 1 per millisecond
        pitch_vib_freq (float): the vibrato frequency (Hz), 0.0 if no vibrato detected
        pitch_vib_ext (float): the vibrato extent (midi), 0.0 if no vibrato detected
        spec_cent (float): the spectral centroid value at the time of the maximum loudness
        spec_flux (float): the spectral flux value at the time of the maximum loudness
        spec_cent_curve (list, float): the inter-note spectral centroid values, 1 per millisecond
        spec_flux_curve (list, float): the inter-note spectral flux values, 1 per millisecond
        seq_len (int): the length of the phrase in which the note falls (filosax_full only, -1 otherwise)
        seq_num (int): the note position in the phrase (filosax_full only, -1 otherwise)

    """

    def __init__(self, input_dict):
        get = input_dict.get

        # Timing measured on the audio recording.
        self.a_start_time = get("a_start_time", 0.0)
        self.a_end_time = get("a_end_time", 0.0)
        self.a_duration = get("a_duration", 0.0)
        self.a_onset_time = get("a_onset_time", 0.0)

        # Pitch and position within the piece.
        self.midi_pitch = get("midi_pitch", 0)
        self.crochet_num = get("crochet_num", 24)
        self.musician = get("musician", 1)
        self.bar_num = get("bar_num", 1)

        # Timing of the score-level (quantised) note.
        self.s_start_time = get("s_start_time", 0.0)
        self.s_duration = get("s_duration", 0.0)
        # Derived rather than read; only computed when a score start time was
        # actually supplied, otherwise pinned to 0.0.
        self.s_end_time = (
            (self.s_start_time + self.s_duration)
            if "s_start_time" in input_dict
            else 0.0
        )
        self.s_rhythmic_duration = get("s_rhythmic_duration", 0.0)
        self.s_rhythmic_position = get("s_rhythmic_position", 0.0)

        # Musical context.
        self.tempo = get("tempo", 0.0)
        self.bar_type = get("bar_type", 1)
        self.is_grace = get("is_grace", 0)
        # List defaults are literals evaluated per call, so every Note gets
        # its own fresh list and instances never share mutable state.
        self.chord_changes = get("chord_changes", [0])
        self.num_chord_changes = get("num_chord_changes", 0)
        self.main_chord_num = get("main_chord_num", 0)
        self.scale_changes = get("scale_changes", [0])

        # Loudness descriptors.
        self.loudness_max_val = get("loudness_max_val", 0.0)
        self.loudness_max_time = get("loudness_max_time", 0.0)
        self.loudness_curve = get("loudness_curve", [0.0])

        # Pitch descriptors.
        self.pitch_average_val = get("pitch_average_val", 0.0)
        self.pitch_average_time = get("pitch_average_time", 0.0)
        self.pitch_curve = get("pitch_curve", [0.0])
        self.pitch_vib_freq = get("pitch_vib_freq", 0.0)
        self.pitch_vib_ext = get("pitch_vib_ext", 0.0)

        # Spectral descriptors.
        self.spec_cent = get("spec_cent", 0.0)
        self.spec_flux = get("spec_flux", 0.0)
        self.spec_cent_curve = get("spec_cent_curve", [0.0])
        self.spec_flux_curve = get("spec_flux_curve", [0.0])

        # Phrase information. Each key is looked up on its own: seq_num was
        # previously guarded by the presence of "seq_len", which raised
        # KeyError when only seq_len was given and ignored a lone seq_num.
        self.seq_len = get("seq_len", -1)
        self.seq_num = get("seq_num", -1)


class Track(core.Track):
    """Filosax track class

    Args:
        track_id (str): track id of the track
        data_home (str): passed through to ``core.Track``
        dataset_name (str): passed through to ``core.Track``
        index (dict): passed through to ``core.Track``
        metadata: passed through to ``core.Track``

    Attributes:
        audio_path (str): path to audio file
        annotation_path (str): path to annotation file
        midi_path (str): path to MIDI file
        musicXML_path (str): path to musicXML file
        pdf_path (str): path to PDF file

    Cached Properties:
        notes (list, Note): an ordered list of Note objects

    """

    def __init__(self, track_id, data_home, dataset_name, index, metadata):
        super().__init__(
            track_id,
            data_home,
            dataset_name=dataset_name,
            index=index,
            metadata=metadata,
        )

        # One path per index key, each resolved through get_path.
        self.audio_path = self.get_path("audio")
        self.annotation_path = self.get_path("annotation")
        self.midi_path = self.get_path("midi")
        self.musicXML_path = self.get_path("musicXML")
        self.pdf_path = self.get_path("pdf")

    @core.cached_property
    def notes(self) -> Optional[List[Note]]:
        """The track's note list - only for Sax files

        Returns:
            * [Note] - ordered list of Note objects loaded from
              ``annotation_path``. When the track has no annotation path
              (Backing files), a one-element list holding a single
              all-default ``Note`` is returned instead.

        """
        if self.annotation_path:
            return load_annotation(self.annotation_path)
        # No annotation file for this track: hand back a placeholder note.
        return [Note({})]

    @property
    def audio(self) -> Optional[Tuple[np.ndarray, float]]:
        """The track's audio, loaded from ``audio_path`` via ``load_audio``

        Returns:
            * np.ndarray - audio signal
            * float - sample rate

        """
        path = self.audio_path
        return load_audio(path)

    def to_jams(self):
        """Build the track's JAMS representation from its audio path only."""
        return jams_utils.jams_converter(audio_path=self.audio_path)


class MultiTrack(core.MultiTrack):
    """Filosax multitrack class

    Args:
        mtrack_id (str): multitrack id
        data_home (str): Local path where the dataset is stored.
            If `None`, looks for the data in the default directory, `~/mir_datasets/Filosax`

    Attributes:
        mtrack_id (str): multitrack id
        tracks (dict): {track_id: Track}
        track_audio_property (str): the name of the attribute of Track which
            returns the audio to be mixed
        name (str): the name of the tune
        duration (float): the duration, in seconds
        beats (list, Observation): the time and beat numbers of bars and chord changes
        chords (list, Observation): the time of chord changes
        segments (list, Observation): the time of segment changes
        bass_drums (Track): the associated bass/drums track
        piano_drums (Track): the associated piano/drums track
        sax (list, Track): a list of associated sax tracks

    Cached Properties:
        annotation (jams.JAMS): a .jams file containing the annotations

    """

    def __init__(
        self, mtrack_id, data_home, dataset_name, index, track_class, metadata
    ):
        super().__init__(
            mtrack_id=mtrack_id,
            data_home=data_home,
            dataset_name=dataset_name,
            index=index,
            track_class=track_class,
            metadata=metadata,
        )
        self.annotation_path = self.get_path("annotations")

    @property
    def track_audio_property(self):
        """Name of the Track attribute that returns the audio to be mixed."""
        return "audio"

    @core.cached_property
    def annotation(self) -> jams.JAMS:
        """The piece-level annotations, loaded from ``annotation_path``

        Returns:
            * jams.JAMS - the object returned by ``jams.load``

        """
        return jams.load(self.annotation_path)

    @property
    def name(self):
        """The track's name

        Returns:
            * str - track name

        """
        return self.annotation["file_metadata"]["title"]

    @property
    def duration(self):
        """The track's duration

        Returns:
            * float - track duration (in seconds)

        """
        return self.annotation["file_metadata"]["duration"]

    @property
    def beats(self):
        """The times of downbeats and chord changes

        Returns:
            * (SortedKeyList, Observation) - timestamp, duration (seconds), beat

        """
        return self.annotation.search(namespace="beat")[0]["data"]

    @property
    def chords(self):
        """The times and values of chord changes

        Returns:
            * (SortedKeyList, Observation) - timestamp, duration (seconds), chord symbol

        """
        return self.annotation.search(namespace="chord")[0]["data"]

    @property
    def segments(self):
        """The times of segment changes (values are 'head', 'written solo', 'improvised solo')

        Returns:
            * (SortedKeyList, Observation) - timestamp, duration (seconds), segment label

        """
        return self.annotation.search(namespace="segment_open")[0]["data"]

    @property
    def bass_drums(self):
        """The associated bass/drums track

        Looked up in ``self.tracks`` under ``<mtrack_id>_bass_drums``; the
        lookup fails if that id is not part of this multitrack (presumably
        the sax-only versions - verify against the index).

        Returns:
            * Track

        """
        return self.tracks[self.mtrack_id + "_bass_drums"]

    @property
    def piano_drums(self):
        """The associated piano/drums track

        Looked up in ``self.tracks`` under ``<mtrack_id>_piano_drums``; the
        lookup fails if that id is not part of this multitrack (presumably
        the sax-only versions - verify against the index).

        Returns:
            * Track

        """
        return self.tracks[self.mtrack_id + "_piano_drums"]

    @property
    def sax(self):
        """The associated sax tracks (participants 1-5)

        Only the participants actually present in ``self.tracks`` are
        returned, in participant order. With all five present the result is
        the same as before; with fewer (the lite flavours are described as
        having 2 sax players) the missing ones are skipped instead of
        raising ``KeyError``.

        Returns:
            * (list, Track)

        """
        # Read the mapping once and reuse it for both the test and the lookup.
        tracks = self.tracks
        sax_ids = ["%s_sax_%d" % (self.mtrack_id, n) for n in range(1, 6)]
        return [tracks[sax_id] for sax_id in sax_ids if sax_id in tracks]

    def to_jams(self):
        """Jams: the track's data in jams format"""
        return self.annotation


@io.coerce_to_bytes_io
def load_audio(fhandle: BinaryIO) -> Tuple[np.ndarray, float]:
    """Load a Filosax audio file.

    The file is read with ``librosa.load`` using ``sr=None`` and
    ``mono=True``.

    Args:
        fhandle (str or file-like): path or file-like object pointing to an audio file

    Returns:
        * np.ndarray - the audio signal
        * float - The sample rate of the audio file

    """
    signal, sample_rate = librosa.load(fhandle, sr=None, mono=True)
    return signal, sample_rate


@io.coerce_to_string_io
def load_annotation(fhandle: TextIO) -> List[Note]:
    """Load a Filosax annotation file.

    The file is parsed as JSON and each entry under its top-level
    ``"notes"`` key is wrapped in a ``Note``.

    Args:
        fhandle (str or file-like): path or file-like object pointing to a
            JSON note-annotation file

    Returns:
        * (list, Note): an ordered list of Note objects

    """
    payload = json.load(fhandle)
    return list(map(Note, payload["notes"]))


@core.docstring_inherit(core.Dataset)
class Dataset(core.Dataset):
    """
    The Filosax dataset
    """

    def __init__(self, data_home=None, version="default"):
        # Filosax-specific configuration forwarded to core.Dataset.
        settings = dict(
            name="filosax",
            track_class=Track,
            multitrack_class=MultiTrack,
            bibtex=BIBTEX,
            indexes=INDEXES,
            download_info=DOWNLOAD_INFO,
            license_info=LICENSE_INFO,
        )
        super().__init__(data_home, version, **settings)