Source code for mirdata.utils

# -*- coding: utf-8 -*-
"""Utility functions for mirdata

Attributes:
    MIR_DATASETS_DIR (str): home folder for MIR datasets

    NoteData (namedtuple): `intervals`, `notes`, `confidence`

    F0Data (namedtuple): `times`, `frequencies`, `confidence`

    MultipitchData (namedtuple): `times`, `frequency_list`, `confidence_list`

    LyricData (namedtuple): `start_times`, `end_times`, `lyrics`, `pronunciations`

    SectionData (namedtuple): `intervals`, `labels`

    BeatData (namedtuple): `beat_times`, `beat_positions`

    ChordData (namedtuple): `intervals`, `labels`

    KeyData (namedtuple): `start_times`, `end_times`, `keys`

    EventData (namedtuple): `start_times`, `end_times`, `event`

    TempoData (namedtuple): `time`, `duration`, `value`, `confidence`

"""


from collections import namedtuple
import hashlib
import os
import json


MIR_DATASETS_DIR = os.path.join(os.getenv('HOME', '/tmp'), 'mir_datasets')


def md5(file_path):
    """Get md5 hash of a file.

    Args:
        file_path (str): File path

    Returns:
        md5_hash (str): md5 hash of data in file_path

    """
    hash_md5 = hashlib.md5()
    with open(file_path, 'rb') as fhandle:
        for chunk in iter(lambda: fhandle.read(4096), b''):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
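
# Illustrative usage sketch (not part of mirdata.utils): compare a local file's
# md5 against a reference checksum. The path and reference value below are
# hypothetical placeholders.
#
#     local_file = '/path/to/mir_datasets/Example/audio/track01.wav'
#     if md5(local_file) != '2e9177d34d61b25f0ad2c8e0175d8328':
#         print('checksum mismatch for {}'.format(local_file))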
def none_path_join(partial_path_list):
    """Join a list of partial paths. If any part of the path is None,
    returns None.

    Args:
        partial_path_list (list): List of partial paths

    Returns:
        path or None (str or None): joined path string or None

    """
    if None in partial_path_list:
        return None
    else:
        return os.path.join(*partial_path_list)
def log_message(message, silence=False):
    """Helper function to log a message.

    Args:
        message (str): message to log
        silence (bool): if True, the message is not logged

    """
    if not silence:
        print(message)
def check_index(dataset_index, data_home):
    """Check the index against local data, finding missing files and files
    with an invalid checksum.

    Args:
        dataset_index (dict): dataset index mapping track ids to file paths and checksums
        data_home (str): local home path where the dataset is stored

    Returns:
        missing_files (dict): dictionary mapping track ids to lists of file paths
            that are in the dataset index but missing locally
        invalid_checksums (dict): dictionary mapping track ids to lists of file paths
            that exist locally but whose checksum differs from the reference checksum

    """
    missing_files = {}
    invalid_checksums = {}

    # loop over track ids
    for track_id, track in dataset_index.items():
        # loop over each data file for this track id
        for key in track.keys():
            filepath = track[key][0]
            checksum = track[key][1]
            if filepath is not None:
                local_path = os.path.join(data_home, filepath)
                # validate that the file exists on disk
                if not os.path.exists(local_path):
                    if track_id not in missing_files.keys():
                        missing_files[track_id] = []
                    missing_files[track_id].append(local_path)
                # validate that the checksum matches
                elif md5(local_path) != checksum:
                    if track_id not in invalid_checksums.keys():
                        invalid_checksums[track_id] = []
                    invalid_checksums[track_id].append(local_path)

    return missing_files, invalid_checksums
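
# Illustrative usage sketch (not part of mirdata.utils): run check_index against
# a minimal, hypothetical index. Each entry maps a key to a
# (relative_path, reference_checksum) pair; all paths and checksums here are
# placeholders.
#
#     example_index = {
#         'track01': {
#             'audio': ('audio/track01.wav', 'placeholder_checksum'),
#             'annotation': ('annotations/track01.csv', 'placeholder_checksum'),
#         }
#     }
#     missing, invalid = check_index(example_index, '/path/to/mir_datasets/Example')
#     # missing and invalid are dictionaries keyed by track id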
def validator(dataset_index, data_home, silence=False):
    """Check the existence and validity of files stored locally with
    respect to the paths and file checksums stored in the reference index.
    Logs invalid checksums and missing files.

    Args:
        dataset_index (dict): dataset index mapping track ids to file paths and checksums
        data_home (str): local home path where the dataset is stored
        silence (bool): if False (default), prints missing and invalid files
            to stdout. Otherwise, this function is equivalent to check_index.

    Returns:
        missing_files (dict): dictionary mapping track ids to lists of file paths
            that are in the dataset index but missing locally
        invalid_checksums (dict): dictionary mapping track ids to lists of file paths
            that exist locally but whose checksum differs from the reference checksum

    """
    missing_files, invalid_checksums = check_index(dataset_index, data_home)

    # print paths of any missing files
    has_any_missing_file = False
    for track_id in missing_files.keys():
        if len(missing_files[track_id]) > 0:
            log_message('Files missing for {}:'.format(track_id), silence)
            for fpath in missing_files[track_id]:
                log_message(fpath, silence)
            log_message('-' * 20, silence)
            has_any_missing_file = True

    # print paths of any invalid checksums
    has_any_invalid_checksum = False
    for track_id in invalid_checksums.keys():
        if len(invalid_checksums[track_id]) > 0:
            log_message('Invalid checksums for {}:'.format(track_id), silence)
            for fpath in invalid_checksums[track_id]:
                log_message(fpath, silence)
            log_message('-' * 20, silence)
            has_any_invalid_checksum = True

    if not (has_any_missing_file or has_any_invalid_checksum):
        log_message(
            'Success: the dataset is complete and all files are valid.', silence
        )
        log_message('-' * 20, silence)

    return missing_files, invalid_checksums
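
# Illustrative usage sketch (not part of mirdata.utils): validator behaves like
# check_index but also prints a report. Reusing the hypothetical example_index
# from the sketch above:
#
#     missing, invalid = validator(example_index, '/path/to/mir_datasets/Example')
#     # pass silence=True to suppress the printed report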
NoteData = namedtuple('NoteData', ['intervals', 'notes', 'confidence'])

F0Data = namedtuple('F0Data', ['times', 'frequencies', 'confidence'])

MultipitchData = namedtuple(
    'MultipitchData', ['times', 'frequency_list', 'confidence_list']
)

LyricData = namedtuple(
    'LyricData', ['start_times', 'end_times', 'lyrics', 'pronunciations']
)

SectionData = namedtuple('SectionData', ['intervals', 'labels'])

BeatData = namedtuple('BeatData', ['beat_times', 'beat_positions'])

ChordData = namedtuple('ChordData', ['intervals', 'labels'])

KeyData = namedtuple('KeyData', ['start_times', 'end_times', 'keys'])

TempoData = namedtuple('TempoData', ['time', 'duration', 'value', 'confidence'])

EventData = namedtuple('EventData', ['start_times', 'end_times', 'event'])
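
# Illustrative usage sketch (not part of mirdata.utils): the annotation
# namedtuples are constructed positionally or by keyword. The beat times,
# positions, intervals, and chord labels below are made-up values.
#
#     beats = BeatData(beat_times=[0.5, 1.0, 1.5, 2.0], beat_positions=[1, 2, 3, 4])
#     chords = ChordData(intervals=[[0.0, 1.0], [1.0, 2.0]], labels=['C:maj', 'G:maj'])
#     print(beats.beat_times, chords.labels)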
def get_default_dataset_path(dataset_name):
    """Get the default path for a dataset given its name.

    Args:
        dataset_name (str): The name of the dataset folder, e.g. 'Orchset'

    Returns:
        save_path (str): Local path to the dataset

    """
    return os.path.join(MIR_DATASETS_DIR, dataset_name)
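
# Illustrative usage sketch (not part of mirdata.utils): the default path is
# simply MIR_DATASETS_DIR joined with the dataset folder name.
#
#     save_path = get_default_dataset_path('Orchset')
#     # e.g. '/home/user/mir_datasets/Orchset' when HOME is '/home/user'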
def load_json_index(filename):
    # load a checksum index file shipped alongside this module in the
    # indexes/ folder
    CWD = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(CWD, 'indexes', filename)) as f:
        return json.load(f)
class cached_property(object):
    """A property that is only computed once per instance and then replaces
    itself with an ordinary attribute. Deleting the attribute resets the
    property.

    Source: https://github.com/bottlepy/bottle/commit/fa7733e075da0d790d809aa3d2f53071897e6f76
    """

    def __init__(self, func):
        self.__doc__ = getattr(func, "__doc__")
        self.func = func

    def __get__(self, obj, cls):
        # type: (Any, type) -> Any
        if obj is None:
            return self
        value = obj.__dict__[self.func.__name__] = self.func(obj)
        return value
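
# Illustrative usage sketch (not part of mirdata.utils): a hypothetical class
# using cached_property so an expensive computation runs only on first access.
#
#     class Example(object):
#         @cached_property
#         def expensive(self):
#             print('computing...')  # printed only once per instance
#             return sum(range(10 ** 6))
#
#     ex = Example()
#     ex.expensive      # computes and caches the value
#     ex.expensive      # returns the cached value without recomputing
#     del ex.expensive  # deleting the attribute resets the property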
class LargeData(object):
    def __init__(self, index_file, metadata_load_fn=None):
        """Object which loads and caches large data the first time it is accessed.

        Parameters
        ----------
        index_file: str
            File name of the checksum index file to be passed to `load_json_index`
        metadata_load_fn: function
            Function which returns a metadata dictionary. If None, the dataset
            is assumed to have no metadata, and calling `metadata` raises a
            NotImplementedError.

        """
        self._metadata = None
        self.index_file = index_file
        self.metadata_load_fn = metadata_load_fn

    @cached_property
    def index(self):
        return load_json_index(self.index_file)

    def metadata(self, data_home):
        if self.metadata_load_fn is None:
            raise NotImplementedError

        if self._metadata is None or self._metadata['data_home'] != data_home:
            self._metadata = self.metadata_load_fn(data_home)

        return self._metadata
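
# Illustrative usage sketch (not part of mirdata.utils): a hypothetical dataset
# module creating a module-level LargeData object. The index filename, metadata
# loader, track id, and data_home path below are placeholders. Note that the
# metadata dictionary stores 'data_home' so the cache is refreshed when the
# data location changes.
#
#     def _load_metadata(data_home):
#         metadata = {'some_track_id': {'artist': 'Some Artist'}}
#         metadata['data_home'] = data_home
#         return metadata
#
#     DATA = LargeData('example_index.json', _load_metadata)
#     DATA.index                      # loads and caches the JSON index
#     DATA.metadata('/path/to/data')  # loads metadata for the given data_home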