Source code for esdl.cube

import math
import os
from datetime import datetime, timedelta

import warnings
import netCDF4
import numpy as np
import esdl
import xarray as xr
import esdl.util
import zarr
from numcodecs import Blosc
from .cube_config import CubeConfig, CUBE_CHANGELOG
# from .cube_provider import CubeSourceProvider
from .version import version as __version__

[docs]class Cube: """ Represents a data cube. Use the static **open()** or **create()** methods to obtain data cube objects. """ def __init__(self, base_dir, config): self._base_dir = base_dir self._config = config self._closed = False self._data = None def __repr__(self) -> str: return 'Cube(%s, \'%s\')' % (self._config, self._base_dir) @property def base_dir(self) -> str: """ The cube's base directory. """ return self._base_dir @property def config(self) -> CubeConfig: """ The cube's configuration. See CubeConfig class. """ return self._config
[docs] def info(self) -> str: """ Return a human-readable information string about this data cube (markdown formatted). """ # TODO (forman, 20151104): read from data cube's dir, see issue #5 return ''
@property def closed(self): """ Checks if the cube has been closed. """ return self._closed @property def data(self): """ The cube's data represented as an xarray dataset """ if not self._data: self._data = xr.open_zarr(self.base_dir) return self._data
[docs] @staticmethod def open(base_dir): """ Open an existing data cube. Use the **Cube.update(provider)** method to add data to the cube via a source data provider. :param base_dir: The data cube's base directory which must be empty or non-existent. :return: A cube instance. """ if not os.path.exists(base_dir): raise IOError('data cube base directory does not exists: %s' % base_dir) zg = zarr.open_group(base_dir) # Construct a CubeConfig object from the zarr attributes. # datetime values have to be parsed config = zg.attrs['cube.config'] for k in ('start_time', 'end_time', 'ref_time'): dt = datetime.strptime(config[k], '%Y-%m-%d %H:%M:%S') config[k] = dt CubeConfig._ensure_compatible_config(config) return Cube(base_dir, CubeConfig(**config))
[docs] @staticmethod def create(base_dir, config=CubeConfig()): """ Create a new data cube. Use the **Cube.update(provider)** method to add data to the cube via a source data provider. :param base_dir: The data cube's base directory. Must not exists. :param config: The data cube's static information. :return: A cube instance. """ if os.path.exists(base_dir): raise IOError('data cube base directory exists: %s' % base_dir) num_periods_per_year = config.num_periods_per_year bndsdim = ('bnds', [0, 1]) temporal_res = config.temporal_res start_year = config.start_time.year end_year = config.end_time.year start_nums = [config.date2num(datetime(yr, 1, 1)) for yr in range(start_year, end_year)] time_bnds_attrs = { 'units': config.time_units, 'calendar': config.calendar, '_ARRAY_DIMENSIONS': ['time', 'bnds'] } def correctub(sn, temporal_res, i, ntot): if i == (ntot-1): return sn+366 else: return sn + temporal_res * (i + 1.0) lower_bounds = [sn + temporal_res * (i + 0.0) for sn in start_nums for i in range(num_periods_per_year)] upper_bounds = [correctub(sn, temporal_res, i, num_periods_per_year) for sn in start_nums for i in range(num_periods_per_year)] time_bnds_vals = np.zeros((len(lower_bounds), 2)) time_bnds_vals[:, 0] = lower_bounds time_bnds_vals[:, 1] = upper_bounds time_attrs = { 'long_name': 'time', 'standard_name': 'time', 'units': config.time_units, 'calendar': config.calendar, 'bounds': 'time_bnds', '_ARRAY_DIMENSIONS': ['time'], } time_vals = [sn + temporal_res * (i + 0.5) for sn in start_nums for i in range(num_periods_per_year)] # times[-1] = var_time_bnds[-1,0] + (var_time_bnds[-1,1] - var_time_bnds[-1,0]) / 2. # Thus, we keep date of the last time range always at Julian day 364, not in the center of the period. # The time bounds then specify the real extent of the period. # Uncomment the upper line to center it between the upper and lower bound! lon_attrs = { 'long_name': 'longitude', 'standard_name': 'longitude', 'units': 'degrees_east', 'bounds': 'lon_bnds', '_ARRAY_DIMENSIONS': ['lon'], } lon_bnds_attrs = {'units': 'degrees_east', '_ARRAY_DIMENSIONS': ['lon', 'bnds']} lat_attrs = { 'long_name': 'latitude', 'standard_name': 'latitude', 'units': 'degrees_north', 'bounds': 'lat_bnds', '_ARRAY_DIMENSIONS': ['lat'], } lat_bnds_attrs = {'units': 'degrees_north', '_ARRAY_DIMENSIONS': ['lat', 'bnds']} lon_vals = np.zeros(config.grid_width) lon_bnds_vals = np.zeros((config.grid_width, 2)) lon_breakpoints = np.linspace(config.lon0, config.lon1, config.grid_width+1, True) lon_bnds_vals[:, 0] = lon_breakpoints[0:config.grid_width] lon_bnds_vals[:, 1] = lon_breakpoints[1:config.grid_width+1] lon_vals = (lon_bnds_vals[:, 0] + lon_bnds_vals[:, 1])/2 lat_vals = np.zeros(config.grid_height) lat_bnds_vals = np.zeros((config.grid_height, 2)) lat_breakpoints = np.linspace(config.lat0, config.lat1, config.grid_height+1, True) lat_bnds_vals[:, 1] = lat_breakpoints[0:config.grid_height] lat_bnds_vals[:, 0] = lat_breakpoints[1:config.grid_height+1] lat_vals = (lat_bnds_vals[:, 0] + lat_bnds_vals[:, 1])/2 ds = xr.Dataset(coords={ 'time': time_vals, 'lat': lat_vals, 'lon': lon_vals, 'time_bnds': (['time', 'bnds'], time_bnds_vals), 'lat_bnds': (['lat', 'bnds'], lat_bnds_vals), 'lon_bnds': (['lon', 'bnds'], lon_bnds_vals), }) ds.attrs['CHANGELOG'] = CUBE_CHANGELOG ds.coords['time'].attrs.update(time_attrs) ds.coords['lon'].attrs.update(lon_attrs) ds.coords['lat'].attrs.update(lat_attrs) ds.coords['time_bnds'].attrs.update(time_bnds_attrs) ds.coords['lon_bnds'].attrs.update(lon_bnds_attrs) ds.coords['lat_bnds'].attrs.update(lat_bnds_attrs) dsz = ds.to_zarr(base_dir) # Write the cubeconfig attribute directly to zarr because this is a nested # dict and not NetCDF compatible, so xarray won't accept this configdict = {} # Convert datetimes to string for JSON repr for k, v in config.__dict__.items(): if isinstance(v, datetime): configdict[k] = str(v) else: configdict[k] = v z = zarr.open_group(base_dir) z.attrs['cube.config'] = configdict return Cube(base_dir, config)
def close(self): warnings.warn( "This function is deprecated. Zarr cubes do not have to be closed.", DeprecationWarning)
[docs] def update(self, provider: 'CubeSourceProvider', image_cache_size=12): """ Updates the data cube with source data from the given image provider. :param provider: An instance of the abstract ImageProvider class """ if self._closed: raise IOError('cube has been closed') provider.prepare() try: target_start_time, target_end_time = provider.temporal_coverage except Exception as e: raise e if self._config.start_time and self._config.start_time > target_start_time: target_start_time = self._config.start_time if self._config.end_time and self._config.end_time < target_end_time: target_end_time = self._config.end_time dsgroup = zarr.open_group(self.base_dir) # Initiate zarr datasets for varname in provider.variable_descriptors: if not varname in dsgroup.array_keys(): self._init_variable_dataset(provider, varname) varnames = provider.variable_descriptors.keys() target_year_1 = target_start_time.year target_year_2 = target_end_time.year cube_temporal_res = self._config.temporal_res num_periods_per_year = self._config.num_periods_per_year datasets = {varname: dsgroup[varname] for varname in provider.variable_descriptors} imagecache = [] # Preallocate cache so that multiple images may be written at once, # This should be much faster when writing time series # Initial index inside zarr array to write to i0 = (target_year_1-self._config.start_time.year)*num_periods_per_year ilast = i0-1 for target_year in range(target_year_1, target_year_2+1): time_min = datetime(target_year, 1, 1) time_max = datetime(target_year + 1, 1, 1) d_time = timedelta(days=cube_temporal_res) time_1 = time_min for time_index in range(num_periods_per_year): time_2 = time_1 + d_time if time_2 > time_max: time_2 = time_max weight = esdl.util.temporal_weight( time_1, time_2, target_start_time, target_end_time) if weight > 0.0: var_name_to_image = provider.compute_variable_images(time_1, time_2) if var_name_to_image: imagecache.append((i0, var_name_to_image)) #print("t1: ", time_1, " t2: ", time_2) if i0-ilast >= image_cache_size: #print("i0 ilast", i0, ilast) if len(imagecache) > 0: # print(datasets) self._write_images(provider, datasets, imagecache, varnames, image_cache_size) imagecache = [] ilast = i0 time_1 = time_2 i0 = i0 + 1 if len(imagecache) > 0: self._write_images(provider, datasets, imagecache, varnames, image_cache_size) provider.close()
def _write_images(self, provider, datasets, imagecache, varnames, image_cache_size): # First we unpack the data into 3d-numpy arrays for var_name in varnames: ds = datasets[var_name] i0 = imagecache[0][0] iend = imagecache[-1][0] + 1 im_first = imagecache[0][1][var_name] image_all = np.full( (iend-i0, im_first.shape[0], im_first.shape[1]), ds.fill_value, ds.dtype) for entry in imagecache: i1 = entry[0] im_now = entry[1][var_name] image_all[i1-i0, :, :] = im_now print("Writing variable %s image from time index %d to %d" % (var_name, i0, iend)) ds[i0:iend, :, :] = image_all def _init_variable_dataset(self, provider, variable_name): import time zdataset = zarr.open_group(self.base_dir) nt = len(zdataset['time']) nx = len(zdataset['lon']) ny = len(zdataset['lat']) if not self._config.chunk_sizes: cs = (1, ny, nx) else: cs = self._config.chunk_sizes if self._config.compression: compressor = Blosc(cname='lz4', clevel=self._config.comp_level) else: compressor = None variable_descriptors = provider.variable_descriptors variable_attributes = variable_descriptors[variable_name] newds = zdataset.create_dataset(variable_name, shape=(nt, ny, nx), chunks=cs, dtype=variable_attributes['data_type'], compressor=compressor, fillvalue=variable_attributes['fill_value']) # TODO (forman, 20160512): some of these attributes could be read from cube configuration # see # see # # check (nf 20151023) - add more global attributes from CF-conventions here, # especially those that reference original sources and originators # # dataset.title = ... # dataset.references = ... # dataset.comment = ... source_attributes = {k: v for k, v in variable_attributes.items( ) if k not in {'data_type', 'fill_value', 'scale_factor', 'add_offset'}} all_attributes = { 'Conventions': 'CF-1.6', 'institution': 'Brockmann Consult GmbH, Germany', 'source': 'ESDL data cube generation, version %s' % __version__, 'history': time.ctime(time.time()) + ' - ESDL data cube generation', 'northing': '%s degrees' % self.config.northing, 'easting': '%s degrees' % self.config.easting, 'source_attributes': source_attributes, # This is actually mangling with xarray-internals to tell xarray about the dimension variables # This will have to change in the future, once there is something like a NetZDF zarr extension # defined, see e.g. issue '_ARRAY_DIMENSIONS': ["time", "lat", "lon"] } for k in all_attributes: newds.attrs[k] = all_attributes[k] return newds @staticmethod def _get_num_steps(x1, x2, dx): return int(math.floor((x2 - x1) / dx))