# Source code for toolbox_scs.detectors.dssc_data

import os
import logging

import h5py
import xarray as xr

from ..util.exceptions import ToolBoxFileError

__all__ = [
    'get_data_formatted',
    'load_xarray',
    'save_attributes_h5',
    'save_xarray',
]

log = logging.getLogger(__name__)


def _to_netcdf(fname, data, group, mode):
    """
    Write data into a group of a netCDF/HDF5 file via the h5netcdf engine.

    Parameters
    ----------
    fname: str
        path of the target file.
    data: object
        object providing a ``to_netcdf`` method (presumably an
        xarray.Dataset or xarray.DataArray -- confirm against callers).
    group: str
        name of the group inside the file the data is written to.
    mode: str
        'a' adds the group to an already existing file. Any other value,
        or a file that does not exist yet, results in the file being
        (re)written with mode 'w'.

    Raises
    ------
    ToolBoxFileError
        appending to an existing file failed with a ValueError or
        TypeError. The original exception is attached as ``__cause__``.
    """
    f_exists = os.path.isfile(fname)

    if f_exists and mode == 'a':
        try:
            data.to_netcdf(fname, group=group, mode='a', engine='h5netcdf')
        except (ValueError, TypeError) as err:
            msg = f"Group {group} exists and has incompatible dimensions."
            log.warning(f"Could not store data: {msg}")
            # chain the low-level error so its traceback is not lost
            raise ToolBoxFileError(msg, fname) from err
        log.info(f"Created group {group} in file {fname}")
        return

    # File is missing, or the caller did not ask for append: (re)write it.
    data.to_netcdf(fname, group=group, mode='w', engine='h5netcdf')
    if f_exists and mode == 'w':
        log.warning(f"File {fname} existed: overwritten")
    log.info(f"Stored data in file {fname}")

def save_xarray(fname, data, group='data', mode='a'):
    """
    Store xarray Dataset in the specified location

    Parameters
    ----------
    fname: str
        filename
    data: xarray.DataSet
        The data to be stored
    group: str
        name of the group inside the file the data is written to.
    mode: str
        'a' adds the group to an existing file, 'w' overwrites an existing
        file. A file that does not exist yet is created in either case.

    Raises
    ------
    ToolBoxFileError: Exception
        The file existed, mode was 'a', and the data could not be written
        to the requested group (e.g. incompatible dimensions).
    """
    # ToolBoxFileError raised by the helper propagates to the caller as is.
    _to_netcdf(fname, data, group, mode)


def save_attributes_h5(fname, data=None):
    """
    Adding attributes to a hdf5 file. This function is intended to be used to
    attach metadata to a processed run.

    Parameters
    ----------
    fname: str
        filename as string
    data: dictionary, optional
        the data that should be added to the file in form of a dictionary.
        Each key/value pair becomes one attribute on the root of the file.
        If omitted, no attributes are written (the file is still opened in
        append mode, and therefore created if missing).
    """
    attributes = {} if data is None else data

    # The context manager closes the file even if writing an attribute
    # raises, so no handle is leaked.
    with h5py.File(fname, mode='a') as f:
        for key, value in attributes.items():
            f.attrs[key] = value
    log.info(f"added attributes to file {fname}")


def load_xarray(fname, group='data', form='dataset'):
    """
    Load stored xarray Dataset.
    Comment: This function exists because of a problem with the standard
    netcdf engine that is malfunctioning due to related software installed
    in the exfel-python environment. May be dropped at some point.

    Parameters
    ----------
    fname: str
        filename as string
    group: str
        the name of the xarray dataset (group in h5 file).
    form: str
        specify whether the data to be loaded is a 'dataset' or a 'array'.

    Returns
    -------
    The result of ``xr.load_dataset`` (form='dataset') or
    ``xr.load_dataarray`` (form='array'), read with the h5netcdf engine.

    Raises
    ------
    ToolBoxFileError
        the file does not exist.
    ValueError
        ``form`` is neither 'dataset' nor 'array'.
    """
    if not os.path.isfile(fname):
        msg = "File does not exists."
        raise ToolBoxFileError(msg, fname)

    if form == 'dataset':
        log.debug(f'open xarray dataset {fname}')
        return xr.load_dataset(fname, group=group, engine='h5netcdf')
    if form == 'array':
        log.debug(f'open xarray dataarray {fname}')
        return xr.load_dataarray(fname, group=group, engine='h5netcdf')

    # An unknown form used to fall through and return None silently.
    raise ValueError(
        f"Unknown form {form!r}: expected 'dataset' or 'array'.")


def _data_from_list(filenames):
    """
    Load every file named in ``filenames`` and collect the results.

    Each file is read through ``load_xarray`` from the group named 'data'.

    Parameters
    ----------
    filenames: list
        names of the xarray files to load

    Returns
    -------
    data: list
        the loaded objects, in the same order as ``filenames``

    Raises
    ------
    ToolBoxFileError
        one of the given names does not point to an existing file.
    """
    loaded = []
    for fname in filenames:
        # Guard clause: stop at the first name that is not a file.
        if not os.path.isfile(fname):
            msg = "File does not exists."
            raise ToolBoxFileError(msg, fname)
        loaded.append(load_xarray(fname, group='data'))
    return loaded


def get_data_formatted(filenames=None, data_list=None):
    """
    Combines the given data into one dataset. The data is concatenated and
    sorted along the 'module' dimension. The array dimension have the order
    'trainId', 'pulse', 'module', 'x', 'y'. This order is required by the
    extra_geometry package.

    Parameters
    ----------
    filenames: list of str, optional
        files to be combined as a list of names. Calls '_data_from_list' to
        actually load the data. Takes precedence over ``data_list``.
    data_list: list, optional
        list containing the already loaded data (xarray.Dataset objects).
        Only used when ``filenames`` is empty or not given.

    Returns
    -------
    data: xarray.Dataset
        A xarray.Dataset containing the combined data. Its attributes are
        cleared.

    Raises
    ------
    ValueError
        neither ``filenames`` nor ``data_list`` contains any element.
    TypeError
        the data to combine are not xarray.Dataset objects.
    """
    if filenames:
        data = _data_from_list(filenames)
    elif data_list:
        data = data_list
    else:
        # Previously this case ended in an UnboundLocalError further down.
        raise ValueError(
            "No data to format: pass a non-empty 'filenames' or 'data_list'.")

    if not isinstance(data[0], xr.Dataset):
        # Other container types were never actually handled: they fell
        # through to an AttributeError on the plain list.
        raise TypeError(
            "Only xarray.Dataset objects can be combined, got "
            f"{type(data[0]).__name__}.")

    # Module numbers are carried in the attributes of the individual
    # datasets; collect them before the attributes are dropped.
    mod_list = [d.attrs['module'] for d in data if 'module' in d.attrs]

    data = xr.concat(data, dim='module')

    # Only label the module axis when module numbers are available; an
    # empty list cannot be assigned to a non-empty dimension.
    if mod_list:
        data = data.assign_coords(module=("module", mod_list))
    data = data.sortby("module")
    data.attrs.clear()
    return data.transpose('trainId', 'pulse', 'module', 'x', 'y')


def search_files(run_folder):
    """
    Search folder for h5 files.

    Parameters
    ----------
    run_folder: str
        the path to a folder containing h5 files. A trailing path separator
        is optional.

    Returns
    -------
    a list with the paths (folder joined with file name) of all entries in
    the given folder whose name ends in '.h5'. The list is empty if the
    folder holds no such entries.

    Raises
    ------
    ToolBoxFileError: Exception
        raises ToolBoxFileError in case the folder cannot be listed, e.g.
        because it does not exist.
    """
    try:
        filenames = os.listdir(run_folder)
        # os.path.join inserts the separator when run_folder lacks one.
        return [os.path.join(run_folder, name)
                for name in filenames if name.endswith(".h5")]
    except (OSError, TypeError) as err:
        msg = "No files in folder"
        raise ToolBoxFileError(msg, run_folder) from err