# -*- coding: utf-8 -*-
"""Utility functions for mirdata

Attributes:
    MIR_DATASETS_DIR (str): home folder for MIR datasets
    NoteData (namedtuple): `intervals`, `notes`, `confidence`
    F0Data (namedtuple): `times`, `frequencies`, `confidence`
    MultipitchData (namedtuple): `times`, `frequency_list`, `confidence_list`
    LyricData (namedtuple): `start_times`, `end_times`, `lyrics`, `pronunciations`
    SectionData (namedtuple): `intervals`, `labels`
    BeatData (namedtuple): `beat_times`, `beat_positions`
    ChordData (namedtuple): `intervals`, `labels`
    KeyData (namedtuple): `start_times`, `end_times`, `keys`
    TempoData (namedtuple): `time`, `duration`, `value`, `confidence`
    EventData (namedtuple): `start_times`, `end_times`, `event`

"""
from collections import namedtuple
import hashlib
import json
import os

MIR_DATASETS_DIR = os.path.join(os.getenv('HOME', '/tmp'), 'mir_datasets')

def md5(file_path):
    """Get the md5 hash of a file.

    Args:
        file_path (str): File path

    Returns:
        md5_hash (str): md5 hash of the data in file_path

    """
    hash_md5 = hashlib.md5()
    with open(file_path, 'rb') as fhandle:
        # read in 4 KiB chunks so large files are hashed without being
        # loaded fully into memory
        for chunk in iter(lambda: fhandle.read(4096), b''):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
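
# Usage sketch for md5 (illustrative; the path is hypothetical):
#
#     >>> checksum = md5('/tmp/mir_datasets/Example/audio/track1.wav')
#     >>> len(checksum)  # md5 hex digests are 32 characters long
#     32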

def none_path_join(partial_path_list):
    """Join a list of partial paths. If any part of the path is None,
    return None.

    Args:
        partial_path_list (list): List of partial paths

    Returns:
        str or None: joined path string, or None if any part of the path
            is None

    """
    if None in partial_path_list:
        return None
    else:
        return os.path.join(*partial_path_list)
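
# Illustrative behavior of none_path_join (paths are made up):
#
#     >>> none_path_join(['data', 'annotations', 'track1.csv'])
#     'data/annotations/track1.csv'
#     >>> none_path_join(['data', None, 'track1.csv']) is None
#     True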

def log_message(message, silence=False):
    """Helper function to log a message.

    Args:
        message (str): message to log
        silence (bool): if True, the message is not logged

    """
    if not silence:
        print(message)
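
# Minimal sketch of log_message:
#
#     >>> log_message('validating...')                # prints 'validating...'
#     >>> log_message('validating...', silence=True)  # prints nothing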

def check_index(dataset_index, data_home):
    """Check the index to find missing files and files with invalid checksums.

    Args:
        dataset_index (dict): dataset index mapping track ids to file paths
            and checksums
        data_home (str): Local home path where the dataset is stored

    Returns:
        missing_files (dict): Lists of file paths that are in the dataset
            index but missing locally, keyed by track id
        invalid_checksums (dict): Lists of file paths whose checksum differs
            from the reference checksum in the index, keyed by track id

    """
    missing_files = {}
    invalid_checksums = {}

    # loop over track ids
    for track_id, track in dataset_index.items():
        # loop over each data file for this track id
        for key in track.keys():
            filepath = track[key][0]
            checksum = track[key][1]
            if filepath is not None:
                local_path = os.path.join(data_home, filepath)
                # validate that the file exists on disk
                if not os.path.exists(local_path):
                    if track_id not in missing_files.keys():
                        missing_files[track_id] = []
                    missing_files[track_id].append(local_path)
                # validate that the checksum matches
                elif md5(local_path) != checksum:
                    if track_id not in invalid_checksums.keys():
                        invalid_checksums[track_id] = []
                    invalid_checksums[track_id].append(local_path)

    return missing_files, invalid_checksums
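
# Sketch of check_index on a toy index. The dict shape shown here,
# {track_id: {file_key: [relative_path, checksum]}}, mirrors how this
# function reads the index; the path and checksum are hypothetical:
#
#     >>> toy_index = {
#     ...     'track1': {
#     ...         'audio': ['audio/track1.wav', 'd41d8cd98f00b204e9800998ecf8427e']
#     ...     }
#     ... }
#     >>> missing, invalid = check_index(toy_index, '/tmp/mir_datasets/toy')
#     >>> missing  # 'track1' appears here if the file is absent locally
#     {'track1': ['/tmp/mir_datasets/toy/audio/track1.wav']}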

def validator(dataset_index, data_home, silence=False):
    """Check the existence and validity of files stored locally, with
    respect to the paths and file checksums stored in the reference index.
    Logs invalid checksums and missing files.

    Args:
        dataset_index (dict): dataset index mapping track ids to file paths
            and checksums
        data_home (str): Local home path where the dataset is stored
        silence (bool): if False (default), prints missing and invalid files
            to stdout. Otherwise, this function is equivalent to check_index.

    Returns:
        missing_files (dict): Lists of file paths that are in the dataset
            index but missing locally, keyed by track id
        invalid_checksums (dict): Lists of file paths whose checksum differs
            from the reference checksum in the index, keyed by track id

    """
    missing_files, invalid_checksums = check_index(dataset_index, data_home)

    # print path of any missing files
    has_any_missing_file = False
    for track_id in missing_files.keys():
        if len(missing_files[track_id]) > 0:
            log_message('Files missing for {}:'.format(track_id), silence)
            for fpath in missing_files[track_id]:
                log_message(fpath, silence)
            log_message('-' * 20, silence)
            has_any_missing_file = True

    # print path of any invalid checksums
    has_any_invalid_checksum = False
    for track_id in invalid_checksums.keys():
        if len(invalid_checksums[track_id]) > 0:
            log_message('Invalid checksums for {}:'.format(track_id), silence)
            for fpath in invalid_checksums[track_id]:
                log_message(fpath, silence)
            log_message('-' * 20, silence)
            has_any_invalid_checksum = True

    if not (has_any_missing_file or has_any_invalid_checksum):
        log_message(
            'Success: the dataset is complete and all files are valid.', silence
        )
        log_message('-' * 20, silence)

    return missing_files, invalid_checksums
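
# validator is check_index plus logging; a typical call, reusing the toy
# index sketched above (output shown for the case where the file is absent):
#
#     >>> missing, invalid = validator(toy_index, '/tmp/mir_datasets/toy')
#     Files missing for track1:
#     /tmp/mir_datasets/toy/audio/track1.wav
#     --------------------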

NoteData = namedtuple('NoteData', ['intervals', 'notes', 'confidence'])
F0Data = namedtuple('F0Data', ['times', 'frequencies', 'confidence'])
MultipitchData = namedtuple(
    'MultipitchData', ['times', 'frequency_list', 'confidence_list']
)
LyricData = namedtuple(
    'LyricData', ['start_times', 'end_times', 'lyrics', 'pronunciations']
)
SectionData = namedtuple('SectionData', ['intervals', 'labels'])
BeatData = namedtuple('BeatData', ['beat_times', 'beat_positions'])
ChordData = namedtuple('ChordData', ['intervals', 'labels'])
KeyData = namedtuple('KeyData', ['start_times', 'end_times', 'keys'])
TempoData = namedtuple('TempoData', ['time', 'duration', 'value', 'confidence'])
EventData = namedtuple('EventData', ['start_times', 'end_times', 'event'])
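
# Constructing one of the annotation namedtuples (values are illustrative;
# in practice mirdata loaders fill these fields with numpy arrays):
#
#     >>> beats = BeatData(beat_times=[0.5, 1.0, 1.5, 2.0],
#     ...                  beat_positions=[1, 2, 3, 4])
#     >>> beats.beat_times[0]
#     0.5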

def get_default_dataset_path(dataset_name):
    """Get the default path for a dataset given its name.

    Args:
        dataset_name (str or None):
            The name of the dataset folder, e.g. 'Orchset'

    Returns:
        save_path (str): Local path to the dataset

    """
    return os.path.join(MIR_DATASETS_DIR, dataset_name)
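
# Example (assuming the environment variable HOME is /home/user):
#
#     >>> get_default_dataset_path('Orchset')
#     '/home/user/mir_datasets/Orchset'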

def load_json_index(filename):
    # resolve the index file relative to this module's 'indexes' directory
    CWD = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(CWD, 'indexes', filename)) as f:
        return json.load(f)
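
# Hypothetical call (the index file name is made up; real names are defined
# by the dataset loaders that ship an index):
#
#     >>> index = load_json_index('example_index.json')
#     >>> list(index.keys())  # track ids defined by that index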

class cached_property(object):
    """A property that is only computed once per instance and then replaces
    itself with an ordinary attribute. Deleting the attribute resets the
    property.

    Source: https://github.com/bottlepy/bottle/commit/fa7733e075da0d790d809aa3d2f53071897e6f76
    """

    def __init__(self, func):
        self.__doc__ = getattr(func, "__doc__")
        self.func = func

    def __get__(self, obj, cls):
        # type: (Any, type) -> Any
        if obj is None:
            return self
        # compute the value once and store it in the instance's __dict__,
        # which shadows this (non-data) descriptor on subsequent lookups
        value = obj.__dict__[self.func.__name__] = self.func(obj)
        return value
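
# Sketch of cached_property semantics (Expensive is a hypothetical class):
#
#     >>> class Expensive(object):
#     ...     @cached_property
#     ...     def data(self):
#     ...         print('computing...')
#     ...         return 42
#     >>> obj = Expensive()
#     >>> obj.data  # first access runs the function
#     computing...
#     42
#     >>> obj.data  # now an ordinary attribute; no recomputation
#     42
#     >>> del obj.data  # deleting the attribute resets the property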

class LargeData(object):
    def __init__(self, index_file, metadata_load_fn=None):
        """Object which loads and caches large data the first time it is
        accessed.

        Args:
            index_file (str): File name of the checksum index file to be
                passed to `load_json_index`
            metadata_load_fn (function): Function which returns a metadata
                dictionary. If None, assume the dataset has no metadata, and
                raise a NotImplementedError when `metadata` is called.

        """
        self._metadata = None
        self.index_file = index_file
        self.metadata_load_fn = metadata_load_fn

    @cached_property
    def index(self):
        return load_json_index(self.index_file)

    def metadata(self, data_home):
        if self.metadata_load_fn is None:
            raise NotImplementedError
        # reload if metadata has not been loaded yet, or was loaded for a
        # different data_home
        if self._metadata is None or self._metadata['data_home'] != data_home:
            self._metadata = self.metadata_load_fn(data_home)
        return self._metadata
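
# Typical use inside a dataset loader module (the index file name and path
# are hypothetical):
#
#     >>> DATA = LargeData('example_index.json', metadata_load_fn=None)
#     >>> DATA.index  # loaded from disk on first access, then cached
#     >>> DATA.metadata('/tmp/mir_datasets/Example')  # raises NotImplementedError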