Source code for mirdata.validate

"""Utility functions for mirdata"""

import hashlib
import logging
import os
import tqdm

from smart_open import open


def md5(file_path):
    """Get md5 hash of a file.

    Args:
        file_path (str): File path

    Returns:
        str: md5 hash of data in file_path

    """
    hash_md5 = hashlib.md5()
    with open(file_path, "rb", compression="disable") as fhandle:
        for chunk in iter(lambda: fhandle.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
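# Example (illustrative, not part of the original module): the md5 of empty
# input is a fixed, well-known digest, so a freshly created empty file can be
# checked against it. The path is hypothetical; note that open() here is
# smart_open's open, imported above.
#
#     >>> open("/tmp/empty.bin", "wb").close()
#     >>> md5("/tmp/empty.bin")
#     'd41d8cd98f00b204e9800998ecf8427e'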
def log_message(message, verbose=True):
    """Helper function to log message

    Args:
        message (str): message to log
        verbose (bool): if false, the message is not logged

    """
    if verbose:
        logging.info(message)
def validate(local_path, checksum):
    """Validate that a file exists and has the correct checksum

    Args:
        local_path (str): file path
        checksum (str): md5 checksum

    Returns:
        * bool - True if file exists
        * bool - True if checksum matches

    """
    # validate that the file exists on disk
    try:
        with open(local_path):
            pass
    except IOError:
        return False, False

    # validate that the checksum matches
    if md5(local_path) != checksum:
        valid = False
    else:
        valid = True
    return True, valid
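# Example (illustrative): the two return values distinguish a missing file
# from a corrupted one. The path and checksum below are made up.
#
#     >>> exists, valid = validate(
#     ...     "/data/dataset/audio/track_01.wav",
#     ...     "3d2e6b9d1fa9c1f4a8b7c6d5e4f3a2b1",
#     ... )
#     >>> if not exists:
#     ...     print("file is missing")
#     ... elif not valid:
#     ...     print("file exists but checksum differs")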
def validate_files(file_dict, data_home, verbose):
    """Validate files

    Args:
        file_dict (dict): dictionary of file information
        data_home (str): path where the data lives
        verbose (bool): if True, show progress

    Returns:
        * dict - missing files
        * dict - files with invalid checksums

    """
    missing = {}
    invalid = {}
    for file_id, file in tqdm.tqdm(file_dict.items(), disable=not verbose):
        for tracks in file.keys():
            # multitrack case
            if tracks == "tracks":
                continue
            # tracks
            else:
                filepath = file[tracks][0]
                checksum = file[tracks][1]
                if filepath is not None:
                    local_path = os.path.join(data_home, filepath)
                    exists, valid = validate(local_path, checksum)
                    if not exists:
                        if file_id not in missing.keys():
                            missing[file_id] = []
                        missing[file_id].append(local_path)
                    elif not valid:
                        if file_id not in invalid.keys():
                            invalid[file_id] = []
                        invalid[file_id].append(local_path)
    return missing, invalid
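# A sketch of the nested shape validate_files() expects, inferred from the
# indexing above; all ids, paths, and checksums are made up. Keys named
# "tracks" (the multitrack case) are skipped, since those member tracks are
# validated separately.
#
#     file_dict = {
#         "track_01": {
#             "audio": ("audio/track_01.wav", "6c0f..."),
#             "annotation": ("annotations/track_01.csv", "9a1b..."),
#         },
#     }
#     missing, invalid = validate_files(file_dict, "/data/dataset", verbose=True)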
def validate_metadata(file_dict, data_home, verbose):
    """Validate metadata files

    Args:
        file_dict (dict): dictionary of file information
        data_home (str): path where the data lives
        verbose (bool): if True, show progress

    Returns:
        * dict - missing files
        * dict - files with invalid checksums

    """
    missing = {}
    invalid = {}
    for file_id, file in tqdm.tqdm(file_dict.items(), disable=not verbose):
        filepath = file[0]
        checksum = file[1]
        if filepath is not None:
            local_path = os.path.join(data_home, filepath)
            exists, valid = validate(local_path, checksum)
            if not exists:
                if file_id not in missing.keys():
                    missing[file_id] = []
                missing[file_id].append(local_path)
            elif not valid:
                if file_id not in invalid.keys():
                    invalid[file_id] = []
                invalid[file_id].append(local_path)
    return missing, invalid
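# Unlike validate_files(), each value here is a (filepath, checksum) pair
# directly, not a dict of such pairs. A sketch with made-up values:
#
#     metadata_dict = {
#         "main_csv": ("annotations/metadata.csv", "f00d..."),
#     }
#     missing, invalid = validate_metadata(
#         metadata_dict, "/data/dataset", verbose=False
#     )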
def validate_index(dataset_index, data_home, verbose=True):
    """Validate files in a dataset's index

    Args:
        dataset_index (dict): dataset index
        data_home (str): local home path where the dataset is stored
        verbose (bool): if True, prints validation status while running

    Returns:
        * dict - file paths that are in the index but missing locally
        * dict - file paths with differing checksums

    """
    missing_files = {}
    invalid_checksums = {}

    # check index
    if "metadata" in dataset_index and dataset_index["metadata"] is not None:
        missing_metadata, invalid_metadata = validate_metadata(
            dataset_index["metadata"], data_home, verbose
        )
        missing_files["metadata"] = missing_metadata
        invalid_checksums["metadata"] = invalid_metadata

    if "tracks" in dataset_index and dataset_index["tracks"] is not None:
        missing_tracks, invalid_tracks = validate_files(
            dataset_index["tracks"], data_home, verbose
        )
        missing_files["tracks"] = missing_tracks
        invalid_checksums["tracks"] = invalid_tracks

    if "multitracks" in dataset_index and dataset_index["multitracks"] is not None:
        missing_multitracks, invalid_multitracks = validate_files(
            dataset_index["multitracks"], data_home, verbose
        )
        missing_files["multitracks"] = missing_multitracks
        invalid_checksums["multitracks"] = invalid_multitracks

    return missing_files, invalid_checksums
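# A minimal sketch of the index layout this function consumes; every id,
# path, and checksum is made up. Sections may be absent or None, in which
# case they are skipped.
#
#     dataset_index = {
#         "metadata": {"main_csv": ("annotations/metadata.csv", "f00d...")},
#         "tracks": {
#             "track_01": {"audio": ("audio/track_01.wav", "6c0f...")},
#         },
#         "multitracks": None,
#     }
#     missing_files, invalid_checksums = validate_index(
#         dataset_index, "/data/dataset"
#     )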
def validator(dataset_index, data_home, verbose=True):
    """Check the existence and validity of files stored locally with
    respect to the paths and file checksums stored in the reference index.
    Logs invalid checksums and missing files.

    Args:
        dataset_index (dict): dataset index
        data_home (str): local home path where the dataset is stored
        verbose (bool): if True (default), logs missing and invalid files.
            Otherwise, this function is equivalent to validate_index.

    Returns:
        missing_files (dict): file paths that are in the dataset index but
            missing locally
        invalid_checksums (dict): file paths that exist locally but whose
            checksum differs from the reference checksum

    """
    missing_files, invalid_checksums = validate_index(
        dataset_index, data_home, verbose
    )

    # print path of any missing files
    has_any_missing_file = False
    for file_id in missing_files:
        if len(missing_files[file_id]) > 0:
            log_message("Files missing for {}:".format(file_id), verbose)
            for fpath in missing_files[file_id]:
                log_message(fpath, verbose)
            log_message("-" * 20, verbose)
            has_any_missing_file = True

    # print path of any invalid checksums
    has_any_invalid_checksum = False
    for file_id in invalid_checksums:
        if len(invalid_checksums[file_id]) > 0:
            log_message("Invalid checksums for {}:".format(file_id), verbose)
            for fpath in invalid_checksums[file_id]:
                log_message(fpath, verbose)
            log_message("-" * 20, verbose)
            has_any_invalid_checksum = True

    if not (has_any_missing_file or has_any_invalid_checksum):
        log_message(
            "Success: the dataset is complete and all files are valid.", verbose
        )
        log_message("-" * 20, verbose)

    return missing_files, invalid_checksums
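# Typical top-level usage (a sketch): log_message() reports through
# logging.info, so configure logging at INFO level to see the output.
# dataset_index here is assumed to have the layout sketched above.
#
#     import logging
#     logging.basicConfig(level=logging.INFO)
#     missing, invalid = validator(dataset_index, "/data/dataset", verbose=True)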