Source code for gcpy.examples.working_with_files.concatenate_files

#!/usr/bin/env python
"""
This Python script concatenates several individual netCDF files
into a single netCDF file using xarray.

Examples
--------

1. Copy this file to a different folder and navigate to that folder.
2. In your copy, edit the file names for your use case.
3. Run the following commands:

.. code-block:: console

   $ conda activate gcpy_env
   (gcpy_env) $ ./concatentate_files.py

Notes
-----

If you have several individual files with one variable per file,
you should consider concatenating them into a single file.
This is often more efficient, as opening each netCDF file incurs
computational overhead.  It is usually faster to read data from
a file with multiple variables than to having to open several
files with one variable each.
"""

# Imports
import os
import warnings
import numpy as np
import xarray as xr
from xarray.coding.variables import SerializationWarning
from gcpy import constants

# Suppress harmless run-time warnings (mostly about underflow or NaNs)
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=SerializationWarning)



[docs]
def find_files_in_dir(path, substrs):
    '''
    Returns a list of all files in a directory that match one or more
    substrings.

    Parameters
    ----------
    path : str
        Path to the directory in which to search for files.
    substrs : list of str
        List of substrings used in the search for files.

    Returns
    -------
    file_list : list of str
        List of files in the directory (specified by path)
        that match all substrings (specified in substrs).
    '''

    # Initialize
    file_list = []

    # Walk through the given data directory.  Then for each file found,
    # add it to file_list if it matches text in search_list.
    for root, _, files in os.walk(path):
        for file_name in files:
            for sub_str in substrs:
                if sub_str in file_name:
                    file_list.append(os.path.join(root, file_name))

    # Return an alphabetically sorted list of files
    file_list.sort()
    return file_list




[docs]
def replace_nans_with_zeroes(dset, verbose=True):
    '''
    Replaces NaN values with zeroes for each variable
    within an an xarray Dataset.

    Parameters
    ----------
    dset : xarray Dataset
        The input dataset, containing one or more data variables.
    verbose : bool, optional
        Set this switch to print out the variable name, as well
        as the min and max of the variable.  This will illustrate
        the replacement of NaNs with zeroes.
    '''

    # Keep all netCDF attributes
    with xr.set_options(keep_attrs=True):

        # Loop over all variables in the Dataset
        for var in dset.data_vars.keys():

            # OPTIONAL STEP:
            # Xarray will try convert missing values to NaN's,
            # so you may need to replace these with zeroes.
            #
            # If your netCDF files represent e.g. emissions,
            # or other physical quantities, you may want to
            # replace these with zeros, so that NaNs won't
            # get read into atmospheric models, etc.
            #
            # NOTE: dset[v].values converts to a numpy ndarray,
            # so that you can use numpy functions.
            dset[var].where(
                np.isnan(dset[var].values),
                other=0.0,
                drop=False
            )

            # OPTIONAL: Print min & max for each variable
            # Comment out if you wish
            if verbose:
                print(f"{var} : {np.min(dset[var].values)} {np.max(dset[var].values)}")

    # Return the modified Datast
    return dset




[docs]
def main():
    '''
    Main program.
    '''

    # File path containing data files
    # (YOU CAN EDIT THIS)
    path_to_dir = '/path/to/my/netcdf/files/'

    # List of search strings that each file must contain
    # (YOU CAN EDIT THIS)
    substrs = ['SpeciesConc']

    # Get a list of variables that GCPy should not read.
    # These are mostly variables introduced into GCHP with the MAPL v1.0.0
    # update.  These variables contain either repeated or non-standard
    # dimensions that can cause problems in xarray when combining datasets.
    skip_vars = constants.SKIP_THESE_VARS

    # Look for all the netCDF files in the path
    file_list = find_files_in_dir(path_to_dir, substrs)

    # Return a single xarray Dataset containing data from all files
    # NOTE: Need to add combine="nested" for xarray 0.15 and higher
    var = xr.__version__.split(".")
    if int(var[0]) == 0 and int(var[1]) >= 15:
        dset = xr.open_mfdataset(file_list,
                                 drop_variables=skip_vars,
                                 combine="nested")
    else:
        dset = xr.open_mfdataset(file_list,
                                 drop_variables=skip_vars)

    # Replace NaN values with zeroes
    dset = replace_nans_with_zeroes(dset, verbose=True)

    # Specify the path and filename for the concatenated data
    # (YOU CAN EDIT THIS)
    outdir = '/path/to/my/output/file'
    outfile = os.path.join(outdir, 'my_concatenated_output_file.nc')

    # Write concatenated data to a netCDF file
    dset.to_netcdf(outfile)



# Only execute when running as a standalone script
if __name__ == "__main__":
    main()