Commit a057e6df authored by Klaus Zimmermann

Improve datahandling documentation (closes #240)

parent dcd5951e
@@ -14,10 +14,14 @@ from .dask_setup import progress
logger = logging.getLogger(__name__)
#: Constant that is used to indicate missing value.
MISSVAL = 1.0e20
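As a hypothetical illustration (not part of this commit), such a fill value is typically substituted for masked points before the data reach a writer without mask support:

```python
import numpy as np

# Hypothetical illustration: fill masked points with the module's
# MISSVAL constant before writing.
masked = np.ma.masked_invalid([1.0, float('nan'), 3.0])
filled = np.ma.filled(masked, MISSVAL)  # -> [1.0, 1.0e20, 3.0]
```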
def ignore_cb(cube, field, filename):
    """
    Callback to ignore certain common global attributes in data files.
    """
    cube.attributes.pop('creation_date', None)
    cube.attributes.pop('tracking_id', None)
    cube.attributes.pop('history', None)
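This callback plugs into iris's loading functions. A minimal usage sketch, with placeholder file names:

```python
import iris

# The callback runs once per loaded cube, stripping attributes that
# would otherwise differ between files and block concatenation.
cubes = iris.load_raw(['a.nc', 'b.nc'], callback=ignore_cb)
```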
@@ -25,6 +29,33 @@ def ignore_cb(cube, field, filename):
def prepare_input_data(datafiles):
"""
Produce a :class:`iris.cube.CubeList` containing cubes for the given data.
This loads the data from all the given files and merges them into once cube
per variable. In the process, the following potentially conflicting global
attributes are removed: `creation_date`, `tracking_id`, `history`,
`history_of_appended_files`. If the given datafiles can not be concatenated
into a single cube per variable, the function raises a :exc:`ValueError`.
Parameters
----------
datafiles : list of string
A list of paths to datafiles.
Returns
-------
cubes : iris.cube.CubeList
A list of cubes, one per variable, referencing the corresponding data
from all the passed data files.
Raises
------
ValueError
If the given data can not be concatenated into one cube per variable.
In this case, it is advised to investigate the problem by loading the
same set of files in an interactive session with iris.
"""
datacubes = iris.load_raw(datafiles, callback=ignore_cb)
iris.util.unify_time_units(datacubes)
equalise_attributes(datacubes)
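The rest of the function body is collapsed in this diff. A usage sketch, assuming a set of NetCDF files that together form one time series per variable (file names are placeholders):

```python
# Hypothetical input: two yearly files of the same variable.
cubes = prepare_input_data(['tas_1990.nc', 'tas_1991.nc'])
for cube in cubes:
    print(cube.name(), cube.shape)
```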
@@ -53,6 +84,38 @@ def prepare_input_data(datafiles):
def save(result, output_filename, iterative_storage=False, client=None):
"""
Save the result cube to the given output file.
If there are outstanding computations in lazy data in the cube, this
function realizes the results, i.e. performs all outstanding computations,
loading the input data into memory. To avoid memory problems, we offer two
different approaches on how this is done:
If `iterative_storage` is `True`, first an empty cube is saved, putting all
metadata and coordinates in place, then the result is realized and stored
one timeslice at a time, sequentially. This potentially reduces
parallelism, but also reduces memory requirements. Furthermore, it means
that on unplanned termination, all finished calculations are already
stored.
If `iterative_storage` is `False`, the complete result is realized first,
maximizing the parallel use of the cluster as exposed by `client`, but
potentially leading to memory problems if there are large intermediate
results. This also means that all results are lost in the case of unplanned
termination.
Parameters
----------
result : iris.cube.Cube
The iris cube to be saved.
output_filename : string
The filename of the output. Must refer to a netCDF4 file.
iterative_storage : bool
Wheter to perform iterative storage (see above).
client : distributed.Client
The :class:`distributed.Client` object giving access to the cluster.
"""
data = result.core_data().rechunk()
if iterative_storage:
logger.info('Storing iteratively')
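The remainder of `save` is truncated in this excerpt. For illustration, a call could look like the following sketch; the `distributed.Client` setup, the `result` cube, and the file name are assumptions, not part of this commit:

```python
from distributed import Client

client = Client()  # connect to a local dask cluster (assumption)

# `result` would be an iris cube produced earlier in the pipeline.
# iterative_storage=True takes the low-memory, crash-tolerant path
# described in the docstring above.
save(result, 'output.nc', iterative_storage=True, client=client)
```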