Source code for gcpy.benchmark.modules.benchmark_funcs

#!/usr/bin/env python3
"""
Specific utilities for creating plots from GEOS-Chem benchmark simulations.
"""
import os
import warnings
import itertools
import gc
import numpy as np
import pandas as pd
import xarray as xr
import sparselt.esmf
import sparselt.xr
from joblib import Parallel, delayed
from tabulate import tabulate
from gcpy.regrid import create_regridders
from gcpy.grid import get_troposphere_mask
from gcpy.util import \
    add_bookmarks_to_pdf, add_missing_variables, add_nested_bookmarks_to_pdf, \
    array_equals, compare_varnames, create_blank_dataarray, dataset_reader, \
    dataset_mean, divide_dataset_by_dataarray, get_area_from_dataset, \
    get_emissions_varnames, get_filepath, get_variables_from_dataset, \
    insert_text_into_file, make_directory, print_totals, read_config_file, \
    read_species_metadata, rename_and_flip_gchp_rst_vars, \
    replace_whitespace, unique_values, verify_variable_type, wrap_text
from gcpy.units import convert_units
from gcpy.constants import \
    COL_WIDTH, ENCODING, MW_AIR_g, SKIP_THESE_VARS, TABLE_WIDTH
from gcpy.plot.compare_single_level import compare_single_level
from gcpy.plot.compare_zonal_mean import compare_zonal_mean
from gcpy.benchmark.modules.benchmark_utils import \
    AOD_SPC, EMISSION_SPC, EMISSION_INV, add_lumped_species_to_dataset, \
    archive_lumped_species_definitions, get_species_categories, \
    archive_species_categories, rename_speciesconc_to_speciesconcvv

# Suppress numpy divide by zero warnings to prevent output spam
np.seterr(divide="ignore", invalid="ignore")



[docs]
def create_total_emissions_table(
        refdata,
        refstr,
        devdata,
        devstr,
        species,
        spcdb_files,
        outfilename,
        ref_interval=[2678400.0],
        dev_interval=[2678400.0],
        template="Emis{}_",
        refmetdata=None,
        devmetdata=None,
):
    """
    Creates a table of emissions totals (by sector and by inventory)
    for a list of species in contained in two data sets.  The data sets,
    which typically represent output from two different model versions,
    are usually contained in netCDF data files.

    Parameters
    ----------
    refdata : xr.Dataset
        The first data set to be compared (aka "Reference" or "Ref").
    refstr : str
        A string that can be used to identify refdata
        (e.g. a model version number or other identifier).
    devdata : xr.Dataset
        The second data set to be compared (aka "Development" or "Dev").
    devstr : str
        A string that can be used to identify the data set specified
        by devfile (e.g. a model version number or other identifier).
    species : dict
        Dictionary containing the name of each species and the target
        unit that emissions will be converted to. The format of
        species is as follows:

            { species_name: target_unit", etc. }

        where "species_name" and "target_unit" are strs.
    spcdb_files : list
        Paths to species_database.yml files in Ref & Dev rundirs.
    outfilename : str
        Name of the text file which will contain the table of
        emissions totals.
    ref_interval : float, optional
        The length of the ref data interval in seconds. By default, interval
        is set to the number of seconds in a 31-day month (86400 * 31),
        which corresponds to typical benchmark simulation output.
        Default value: [2678400.0]
    dev_interval : float, optional
        The length of the dev data interval in seconds. By default, interval
        is set to the number of seconds in a 31-day month (86400 * 31),
        which corresponds to typical benchmark simulation output.
        Default value: [2678400.0]
    template : str, optional
        Template for the diagnostic names that are contained both
        "Reference" and "Development" data sets.  If not specified,
        template will be set to "Emis{}", where {} will be replaced
        by the species name.
        Default value: "Emis{}_"
    ref_area_varname : str, optional
        Name of the variable containing the grid box surface areas
        (in m2) in the ref dataset.
        Default value: 'AREA'
    dev_area_varname : str, optional
        Name of the variable containing the grid box surface areas
        (in m2) in the dev dataset.
        Default value: 'AREA'
    refmetdata : xr.Dataset, optional
        Dataset containing ref meteorology and area.
        Default value: None
    devmetdata : xr.Dataset, optional
        Dataset containing dev meteorology and area.
        Default value: None

    Notes
    -----
    This method is mainly intended for model benchmarking purposes,
    rather than as a general-purpose tool.

    Species metadata (such as molecular weights) are read from a
    YAML file called "species_database.yml".
    """

    # ==================================================================
    # Initialization
    # ==================================================================
    verify_variable_type(refdata, xr.Dataset)
    verify_variable_type(refstr, str)
    verify_variable_type(devdata, xr.Dataset)
    verify_variable_type(devstr, str)
    verify_variable_type(species, dict)
    verify_variable_type(spcdb_files, list)
    verify_variable_type(outfilename, str)

    # Get ref area [m2]
    if "AREA" in refdata.data_vars.keys():
        refarea = refdata["AREA"]
    elif refmetdata is not None:
        refarea = refmetdata["Met_AREAM2"]
    else:
        msg = "AREA variable is not in the ref Dataset and " + \
              "optional dataset containing Met_AREAM2 is not passed!"
        raise ValueError(msg)

    # Get dev area [m2]
    if "AREA" in devdata.data_vars.keys():
        devarea = devdata["AREA"]
    elif devmetdata is not None:
        devarea = devmetdata["Met_AREAM2"]
    else:
        msg = "AREA variable is not in the dev Dataset and optional " + \
              "dataset containing Met_AREAM2 is not passed!"
        raise ValueError(msg)

    # Read the species database files in the Ref & Dev rundirs,
    # and return a dict for each containing the species metadata.
    ref_metadata, dev_metadata = read_species_metadata(
        spcdb_files,
        quiet=True
    )

    # Replace whitespace in the ref and dev labels
    refstr = replace_whitespace(refstr)
    devstr = replace_whitespace(devstr)

    # ==================================================================
    # Get the list of emission variables for which we will print totals
    # ==================================================================

    # Find all common variables between the two datasets
    # and get the lists of variables only in Ref and only in Dev,
    vardict = compare_varnames(refdata, devdata, quiet=True)
    cvars = vardict["commonvars"]
    refonly = vardict["refonly"]
    devonly = vardict["devonly"]

    # Make sure that Ref and Dev datasets have the same variables.
    # Variables that are in Ref but not in Dev will be added to Dev
    # with all missing values (NaNs). And vice-versa.
    [refdata, devdata] = add_missing_variables(refdata, devdata)

    # =================================================================
    # Open the file for output
    # =================================================================
    # Create file
    try:
        f = open(outfilename, "w", encoding=ENCODING)
    except (IOError, OSError, FileNotFoundError) as e:
        raise e(f"Could not open {outfilename} for writing!") from e

    # Write a placeholder to the file that denotes where
    # the list of species with differences will be written
    placeholder = "@%% insert diff status here %%@"
    print(f"{placeholder}\n\n", file=f)

    # Define a list for differences
    diff_list = []

    # =================================================================
    # Loop through all of the species are in species_dict
    # =================================================================
    for species_name, target_units in species.items():

        # Get a list of emission variable names for each species
        diagnostic_template = template.replace("{}", species_name)
        varnames = get_emissions_varnames(cvars, diagnostic_template)

        # Also add variables that might be in either Ref or Dev
        # but not the other.  This will allow us to print totals
        # for all species (and print NaN for the missing ones).
        if len(refonly) > 0:
            matching = [v for v in refonly if diagnostic_template in v]
            varnames = varnames + matching
        if len(devonly) > 0:
            matching = [v for v in devonly if diagnostic_template in v]
            varnames = varnames + matching

        # Sort the list again  to account for new variables added above
        varnames.sort()

        # If no emissions are found, then skip to next species
        if len(varnames) == 0:
            msg = f"No emissions found for {species_name} ... skippping"
            print(msg)
            continue

        # Check if there is a total emissions variable in the list
        vartot = [v for v in varnames if "_TOTAL" in v.upper()]

        # Push the total variable to the last list element
        # so that it will be printed last of all
        if len(vartot) == 1:
            varnames.append(varnames.pop(varnames.index(vartot[0])))

        # Title strings
        if "Inv" in template:
            print(f"Computing inventory totals for {species_name}")
            title0 = f"for inventory {species_name}"
            title1 = f"### Emissions totals {title0} [Tg]"
        else:
            print(f"Computing emissions totals for {species_name}")
            title0 = f"for species {species_name}"
            title1 = f"### Emissions totals {title0} [Tg]"

        title2 = f"### Ref = {refstr}"
        title3 = f"### Dev = {devstr}"

        # Print header to file
        print("#" * TABLE_WIDTH, file=f)
        print(f"{title1 : <{TABLE_WIDTH-3}}{'###'}", file=f)
        print(f"{title2 : <{TABLE_WIDTH-3}}{'###'}", file=f)
        print(f"{title3 : <{TABLE_WIDTH-3}}{'###'}", file=f)
        print("#" * TABLE_WIDTH, file=f)
        print(f"{'' : <{COL_WIDTH-1}}{'Ref' : >{COL_WIDTH}}{'Dev' : >{COL_WIDTH}}{'Dev - Ref' : >{COL_WIDTH}}{'% diff' : >{COL_WIDTH}} {'diffs'}", file=f)

        # =============================================================
        # Loop over all emissions variables corresponding to this
        # species and print their totals in Ref and Dev to the file.
        # =============================================================
        for v in varnames:

            if "Inv" in template:
                spc_name = v.split("_")[1]
            else:
                spc_name = species_name

            # Get metadata for the given species
            ref_species_metadata = ref_metadata.get(spc_name)
            dev_species_metadata = dev_metadata.get(spc_name)
            if ref_species_metadata is None and dev_species_metadata is None:
                print(f"No metadata found for {spc_name} ... skipping")
                continue

            # Convert units of Ref and Dev and save to numpy ndarray objects
            # (or set to NaN if the variable is not found in Ref or Dev)
            if v in refonly and v not in devonly:

                # Convert units of Ref
                refarray = convert_units(
                    refdata[v],
                    spc_name,
                    ref_species_metadata,
                    target_units,
                    interval=ref_interval,
                    area_m2=refarea,
                )

                # Set Dev to NaN (missing values) everywhere
                devarray = create_blank_dataarray(
                    name=refdata[v].name,
                    sizes=devdata.sizes,
                    coords=devdata.coords,
                    attrs=refdata[v].attrs,
                )

            elif v in devonly and v not in refonly:

                # Convert units of Dev
                devarray = convert_units(
                    devdata[v],
                    spc_name,
                    dev_species_metadata,
                    target_units,
                    interval=dev_interval,
                    area_m2=devarea,
                )

                # Set Ref to NaN (missing values) everywhere
                refarray = create_blank_dataarray(
                    name=devdata[v].name,
                    sizes=refdata.sizes,
                    coords=refdata.coords,
                    attrs=devdata[v].attrs,
                )

            else:

                # Convert units of both Ref and Dev
                refarray = convert_units(
                    refdata[v],
                    spc_name,
                    ref_species_metadata,
                    target_units,
                    interval=ref_interval,
                    area_m2=refarea,
                )
                devarray = convert_units(
                    devdata[v],
                    spc_name,
                    dev_species_metadata,
                    target_units,
                    interval=dev_interval,
                    area_m2=devarea,
                )

            # ==========================================================
            # Print emission totals for Ref and Dev
            # ==========================================================
            print_totals(
                refarray,
                devarray,
                f,
                diff_list
            )

        # Add newlines before going to the next species
        print(file=f)
        print(file=f)

    # =================================================================
    # Cleanup and quit
    # =================================================================

    # Close file
    f.close()

    # Reopen file and replace placeholder with list of diffs
    insert_text_into_file(
        filename=outfilename,
        search_text=placeholder,
        replace_text=diff_list_to_text(
            refstr,
            devstr,
            diff_list),
        width=TABLE_WIDTH
    )



[docs]
def create_global_mass_table(
        refdata,
        refstr,
        devdata,
        devstr,
        varlist,
        met_and_masks,
        spcdb_files,
        ref_hdr_label="",
        dev_hdr_label="",
        trop_only=False,
        outfilename="GlobalMass_TropStrat.txt",
        verbose=False,
):
    """
    Creates a table of global masses for a list of species in contained in
    two data sets.  The data sets, which typically represent output from two
    different model versions, are usually contained in netCDF data files.

    Parameters
    ----------
    refdata : xr.Dataset
        The first data set to be compared (aka "Reference").
    refstr : str
        A string that can be used to identify refdata
        (e.g. a model version number or other identifier).
    devdata : xr.Dataset
        The second data set to be compared (aka "Development").
    devstr : str
        A string that can be used to identify the data set specified
        by devfile (e.g. a model version number or other identifier).
    varlist : list of str
        List of species concentration variable names to include
        in the list of global totals.
    met_and_masks : dict of xr.DataArray
        Dictionary containing the meteorological variables and
        masks for the Ref and Dev datasets.
    spcdb_files : list
        Paths to species_database.yml files in Ref & Dev rundirs.
    ref_hdr_label : str, optional
        Label for Ref, placed after refstr in the file header.
        Default value: ""
    dev_hdr_label : str, optional
        Label for Dev, placed after devstr in the file header.
        Default value: ""
    trop_only : bool, optional
        Set this switch to True if you wish to print totals
        only for the troposphere.
        Default value: False (i.e. print whole-atmosphere totals).
    outfilename : str, optional
        Name of the text file which will contain the table of
        emissions totals.
        Default value: "GlobalMass_TropStrat.txt"
    verbose : bool, optional
        Set this switch to True if you wish to print out extra
        informational messages.
        Default value: False

    Notes
    -----
    This method is mainly intended for model benchmarking purposes,
    rather than as a general-purpose tool.

    Species metadata (such as molecular weights) are read from a
    YAML file called "species_database.yml".
    """

    # ==================================================================
    # Initialization
    # ==================================================================
    verify_variable_type(refdata, xr.Dataset)
    verify_variable_type(devdata, xr.Dataset)

    # Make sure required arguments are passed
    if varlist is None:
        raise ValueError('The "varlist" argument was not passed!')
    if met_and_masks is None:
        raise ValueError('The "met_and_masks" argument was not passed!')

    # Read the species database files in the Ref & Dev rundirs, and
    # return a dict containing metadata for each.
    ref_metadata, dev_metadata = read_species_metadata(
        spcdb_files,
        quiet=True
    )

    # Replace whitespace in the ref and dev labels
    refstr = replace_whitespace(refstr)
    devstr = replace_whitespace(devstr)

    # ==================================================================
    # Open file for output
    # ==================================================================

    # Create file
    try:
        f = open(outfilename, "w", encoding=ENCODING)
    except (IOError, OSError, FileNotFoundError) as e:
        raise e(f"Could not open {outfilename} for writing!") from e

    # Define a list for differences
    diff_list = []

    # Title strings
    title1 = f"### Global mass (Gg) (Trop + Strat)"
    if trop_only:
        title1 = f"### Global mass (Gg) (Trop only)"
    title2 = f"### Ref = {refstr} {ref_hdr_label}"
    title3 = f"### Dev = {devstr} {dev_hdr_label}"

    # Write a placeholder to the file that denotes where
    # the list of species with differences will be written
    placeholder = "@%% insert diff status here %%@"

    # Print header to file
    print("#" * TABLE_WIDTH, file=f)
    print(f"{title1 : <{TABLE_WIDTH-3}}{'###'}", file=f)
    print(f"{'###'  : <{TABLE_WIDTH-3}}{'###'}", file=f)
    print(f"{title2 : <{TABLE_WIDTH-3}}{'###'}", file=f)
    print(f"{title3 : <{TABLE_WIDTH-3}}{'###'}", file=f)
    print(f"{'###'  : <{TABLE_WIDTH-3}}{'###'}", file=f)
    print(f"{placeholder}", file=f)
    print("#" * TABLE_WIDTH, file=f)

    # Column headers
    print(f"{'' : <{COL_WIDTH-1}}{'Ref' : >{COL_WIDTH}}{'Dev' : >{COL_WIDTH}}{'Dev - Ref' : >{COL_WIDTH}}{'% diff' : >{COL_WIDTH}} {'diffs'}", file=f)

    # ==================================================================
    # Print global masses for all species
    #
    # NOTE: By this point, all secies will be in both Ref and Dev'
    # because we have added them in the calling routine
    # ==================================================================
    for v in varlist:

        # Get the species name
        spc_name = v.split("_")[1]

        # Get metadta for the given species
        ref_species_metadata = ref_metadata.get(spc_name)
        dev_species_metadata = dev_metadata.get(spc_name)
        if ref_species_metadata is None and dev_species_metadata is None:
            if verbose:
                msg = f"No metadata found for {spc_name} ... skippping"
                print(msg)
            continue

        # Specify target units
        target_units = "Gg"

        # ==============================================================
        # Convert units of Ref and save to a DataArray
        # (or skip if Ref contains NaNs everywhere)
        # ==============================================================
        refarray = refdata[v]
        if not np.isnan(refdata[v].values).all():
            refarray = convert_units(
                refarray,
                spc_name,
                ref_species_metadata,
                target_units,
                area_m2=met_and_masks["Ref_Area"],
                delta_p=met_and_masks["Ref_Delta_P"],
                box_height=met_and_masks["Ref_BxHeight"],
            )

        # ==============================================================
        # Convert units of Dev and save to a DataArray
        # (or skip if Dev contains NaNs everywhere)
        # ==============================================================
        devarray = devdata[v]
        if not np.isnan(devdata[v].values).all():
            devarray = convert_units(
                devarray,
                spc_name,
                dev_species_metadata,
                target_units,
                area_m2=met_and_masks["Dev_Area"],
                delta_p=met_and_masks["Dev_Delta_P"],
                box_height=met_and_masks["Dev_BxHeight"],
            )

        # ==============================================================
        # Print global masses for Ref and Dev
        # (we will mask out tropospheric boxes in print_totals)
        # ==============================================================
        if trop_only:
            print_totals(
                refarray,
                devarray,
                f,
                diff_list,
                masks=met_and_masks
            )
        else:
            print_totals(
                refarray,
                devarray,
                f,
                diff_list
            )

    # ==================================================================
    # Cleanup and quit
    # ==================================================================

    # Close file
    f.close()

    # Reopen file and replace placeholder text by diff_text
    insert_text_into_file(
        filename=outfilename,
        search_text=placeholder,
        replace_text=diff_list_to_text(
            refstr,
            devstr,
            diff_list,
            fancy_format=True
        ),
        width=TABLE_WIDTH
    )




[docs]
def create_mass_accumulation_table(
        refdatastart,
        refdataend,
        refstr,
        refperiodstr,
        devdatastart,
        devdataend,
        devstr,
        devperiodstr,
        varlist,
        met_and_masks,
        label,
        spcdb_files,
        trop_only=False,
        outfilename="GlobalMassAccum_TropStrat.txt",
        verbose=False,
):
    """
    Creates a table of global mass accumulation for a list of species in
    two data sets.  The data sets, which typically represent output from two
    different model versions, are usually contained in netCDF data files.

    Parameters
    ----------
    refdatastart : xr.Dataset
        The first data set to be compared (aka "Reference").
    refdataend : xr.Dataset
        The first data set to be compared (aka "Reference").
    refstr : str
        A string that can be used to identify refdata
        (e.g. a model version number or other identifier).
    refperiodstr : str
        Ref simulation period start and end.
    devdatastart : xr.Dataset
        The second data set to be compared (aka "Development").
    devdataend : xr.Dataset
        The second data set to be compared (aka "Development").
    devstr : str
        A string that can be used to identify the data set specified
        by devfile (e.g. a model version number or other identifier).
    devperiodstr : str
        Dev simulation period start and end.
    varlist : list of str
        List of species concentration variable names to include
        in the list of global totals.
    met_and_masks : dict of xr.DataArray
        Dictionary containing the meteorological variables and
        masks for the Ref and Dev datasets.
    label : str
        Label to go in the header string.  Can be used to
        pass the month & year.
    spcdb_files : list
        Paths to species_database.yml files in Ref & Dev rundirs.
    trop_only : bool, optional
        Set this switch to True if you wish to print totals
        only for the troposphere.
        Default value: False (i.e. print whole-atmosphere totals).
    outfilename : str, optional
        Name of the text file which will contain the table of
        emissions totals.
        Default value: "GlobalMass_TropStrat.txt"
    verbose : bool, optional
        Set this switch to True if you wish to print out extra
        informational messages.
        Default value: False

    Notes
    -----
    This method is mainly intended for model benchmarking purposes,
    rather than as a general-purpose tool.

    Species metadata (such as molecular weights) are read from a
    YAML file called "species_database.yml".
    """

    # ==================================================================
    # Initialization
    # ==================================================================
    verify_variable_type(refdatastart, xr.Dataset)
    verify_variable_type(refdataend, xr.Dataset)
    verify_variable_type(devdatastart, xr.Dataset)
    verify_variable_type(devdataend, xr.Dataset)

    # Make sure required arguments are passed
    if varlist is None:
        raise ValueError('The "varlist" argument was not passed!')
    if met_and_masks is None:
        raise ValueError('The "met_and_masks" argument was not passed!')

    # Read the species database files in the Ref & Dev rundirs, and
    # return a dict containing metadata for each.
    ref_metadata, dev_metadata = read_species_metadata(
        spcdb_files,
        quiet=True
    )

    # Replace whitespace in the ref and dev labels
    refstr = replace_whitespace(refstr)
    devstr = replace_whitespace(devstr)

    # ==================================================================
    # Open file for output
    # ==================================================================

    # Create file
    try:
        f = open(outfilename, "w", encoding=ENCODING)
    except (IOError, OSError, FileNotFoundError) as e:
        raise e(f"Could not open {outfilename} for writing!") from e

    # Define a list for differences
    diff_list = []

    # Title strings
    title1 = f"### Global mass accumulation (Gg) {label} (Trop + Strat)"
    if trop_only:
        title1 = f"### Global mass accumulation (Gg) {label} (Trop only)"
    title2 = "### Computed as change in instantaneous mass across period"
    title3 = f"### Ref = {refstr}"
    title4 = f"### Dev = {devstr}"
    title5 = f"### Ref period: {refperiodstr}"
    title6 = f"### Dev period: {devperiodstr}"

    # Write a placeholder to the file that denotes where
    # the list of species with differences will be written
    placeholder = "@%% insert diff status here %%@"

    # Print header to file
    print("#" * TABLE_WIDTH, file=f)
    print(f"{title1 : <{TABLE_WIDTH-3}}{'###'}", file=f)
    print(f"{'###'  : <{TABLE_WIDTH-3}}{'###'}", file=f)
    print(f"{title2 : <{TABLE_WIDTH-3}}{'###'}", file=f)
    print(f"{'###'  : <{TABLE_WIDTH-3}}{'###'}", file=f)
    print(f"{title3 : <{TABLE_WIDTH-3}}{'###'}", file=f)
    print(f"{title4 : <{TABLE_WIDTH-3}}{'###'}", file=f)
    print(f"{'###'  : <{TABLE_WIDTH-3}}{'###'}", file=f)
    print(f"{title5 : <{TABLE_WIDTH-3}}{'###'}", file=f)
    print(f"{title6 : <{TABLE_WIDTH-3}}{'###'}", file=f)
    print(f"{'###'  : <{TABLE_WIDTH-3}}{'###'}", file=f)
    print(f"{placeholder}", file=f)
    print("#" * TABLE_WIDTH, file=f)

    # Column headers
    print(f"{'' : <{COL_WIDTH-1}}{'Ref' : >{COL_WIDTH}}{'Dev' : >{COL_WIDTH}}{'Dev - Ref' : >{COL_WIDTH}}{'% diff' : >{COL_WIDTH}} {'diffs'}", file=f)

    # ==================================================================
    # Print global masses for all species
    #
    # NOTE: By this point, all secies will be in both Ref and Dev'
    # because we have added them in the calling routine
    # ==================================================================
    for v in varlist:

        # Get the species name
        spc_name = v.split("_")[1]

        # Get a list of metadata for the given species
        ref_species_metadata = ref_metadata.get(spc_name)
        dev_species_metadata = dev_metadata.get(spc_name)
        if ref_species_metadata is None and dev_species_metadata is None:
            if verbose:
                msg = f"No properties found for {spc_name} ... skippping"
                print(msg)
            continue

        # Specify target units
        target_units = "Gg"

        # ==============================================================
        # Convert units of Ref and save to a DataArray
        # (or skip if Ref contains NaNs everywhere)
        # ==============================================================
        refarrays = refdatastart[v]
        if not np.isnan(refdatastart[v].values).all():
            refarrays = convert_units(
                refarrays,
                spc_name,
                ref_species_metadata,
                target_units,
                area_m2=met_and_masks["Refs_Area"],
                delta_p=met_and_masks["Refs_Delta_P"],
                box_height=met_and_masks["Refs_BxHeight"],
            )

        refarraye = refdataend[v]
        if not np.isnan(refdataend[v].values).all():
            refarraye = convert_units(
                refarraye,
                spc_name,
                ref_species_metadata,
                target_units,
                area_m2=met_and_masks["Refe_Area"],
                delta_p=met_and_masks["Refe_Delta_P"],
                box_height=met_and_masks["Refe_BxHeight"],
            )

        refarray = refarrays
        refarray.values = refarraye.values - refarrays.values

        # ==============================================================
        # Convert units of Dev and save to a DataArray
        # (or skip if Dev contains NaNs everywhere)
        # ==============================================================
        devarrays = devdatastart[v]
        if not np.isnan(devdatastart[v].values).all():
            devarrays = convert_units(
                devarrays,
                spc_name,
                dev_species_metadata,
                target_units,
                area_m2=met_and_masks["Devs_Area"],
                delta_p=met_and_masks["Devs_Delta_P"],
                box_height=met_and_masks["Devs_BxHeight"],
            )
        #print('devarrays: {}'.format(devarrays.values))

        devarraye = devdataend[v]
        if not np.isnan(devdataend[v].values).all():
            devarraye = convert_units(
                devarraye,
                spc_name,
                dev_species_metadata,
                target_units,
                area_m2=met_and_masks["Deve_Area"],
                delta_p=met_and_masks["Deve_Delta_P"],
                box_height=met_and_masks["Deve_BxHeight"],
            )

        devarray = devarrays
        devarray.values = devarraye.values - devarrays.values

        # ==============================================================
        # Print global masses for Ref and Dev
        # (we will mask out tropospheric boxes in print_totals)
        # ==============================================================
        # ewl: for now trop_only is always false for accumulation table
        if trop_only:
            print_totals(
                refarray,
                devarray,
                f,
                diff_list,
                masks=met_and_masks
            )
        else:
            print_totals(
                refarray,
                devarray,
                f,
                diff_list
            )

    # ==================================================================
    # Cleanup and quit
    # ==================================================================

    # Close file
    f.close()

    # Reopen file and replace placeholder text by diff_text
    insert_text_into_file(
        filename=outfilename,
        search_text=placeholder,
        replace_text=diff_list_to_text(
            refstr,
            devstr,
            diff_list,
            fancy_format=True
        ),
        width=TABLE_WIDTH
    )




[docs]
def make_benchmark_conc_plots(
        ref,
        refstr,
        dev,
        devstr,
        spcdb_files,
        dst="./benchmark",
        subdst=None,
        overwrite=False,
        verbose=False,
        collection="SpeciesConc",
        benchmark_type="FullChemBenchmark",
        cmpres=None,
        plot_by_spc_cat=True,
        restrict_cats=[],
        plots=["sfc", "500hpa", "zonalmean"],
        use_cmap_RdBu=False,
        log_color_scale=False,
        sigdiff_files=None,
        normalize_by_area=False,
        cats_in_ugm3=["Aerosols", "Secondary_Organic_Aerosols"],
        areas=None,
        refmet=None,
        devmet=None,
        weightsdir='.',
        n_job=-1,
        second_ref=None,
        second_dev=None,
        time_mean=False,
):
    """
    Creates PDF files containing plots of species concentration
    for model benchmarking purposes.

    Parameters
    ----------
    ref : str
        Path name for the "Ref" (aka "Reference") data set.
    refstr : str
        A string to describe ref (e.g. version number).
    dev : str
        Path name for the "Dev" (aka "Development") data set.
        This data set will be compared against the "Reference"
        data set.
    devstr : str
        A string to describe dev (e.g. version number).
    spcdb_files : list
        Paths to species_database.yml files in Ref & Dev rundirs.
    dst : str, optional
        A string denoting the destination folder where a PDF
        file containing plots will be written.
        Default value: ./benchmark
    subdst : str, optional
        A string denoting the sub-directory of dst where PDF
        files containing plots will be written.  In practice,
        subdst is only needed for the 1-year benchmark output,
        and denotes a date string (such as "Jan2016") that
        corresponds to the month that is being plotted.
        Default value: None
    overwrite : bool, optional
        Set this flag to True to overwrite files in the
        destination folder (specified by the dst argument).
        Default value: False
    verbose : bool, optional
        Set this flag to True to print extra informational output.
        Default value: False
    collection : str, optional
        Name of collection to use for plotting.
        Default value: "SpeciesConc"
    benchmark_type : str, optional
        A string denoting the type of benchmark output to plot, options are
        FullChemBenchmark, TransportTracersBenchmark, or CH4Benchmark.
        Default value: "FullChemBenchmark"
    cmpres : str, optional
        Grid resolution at which to compare ref and dev data, e.g. '1x1.25'.
    plot_by_spc_cat : bool, optional
        Set this flag to False to send plots to one file rather
        than separate file per category.
        Default value: True
    restrict_cats : list of str, optional
        List of benchmark categories in benchmark_categories.yml to make
        plots for. If empty, plots are made for all categories.
        Default value: empty
    plots : list of str, optional
        List of plot types to create.
        Default value: ['sfc', '500hpa', 'zonalmean']
    log_color_scale : bool, optional
        Set this flag to True to enable plotting data (not diffs)
        on a log color scale.
        Default value: False
    normalize_by_area : bool, optional
        Set this flag to true to enable normalization of data
        by surface area (i.e. kg s-1 --> kg s-1 m-2).
        Default value: False
    cats_in_ugm3 : list of str, optional
        List of benchmark categories to to convert to ug/m3.
        Default value: ["Aerosols", "Secondary_Organic_Aerosols"]
    areas : dict of xr.DataArray, optional
        Grid box surface areas in m2 on Ref and Dev grids.
        Default value: None
    refmet : str, optional
        Path name for ref meteorology.
        Default value: None
    devmet : str, optional
        Path name for dev meteorology.
        Default value: None
    sigdiff_files : list of str, optional
        Filenames that will contain the lists of species having
        significant differences in the 'sfc', '500hpa', and
        'zonalmean' plots.  These lists are needed in order to
        fill out the benchmark approval forms.
        Default value: None
    weightsdir : str, optional
        Directory in which to place (and possibly reuse) xESMF regridder
        netCDF files.
        Default value: '.'
    n_job : int, optional
        Defines the number of simultaneous workers for parallel plotting.
        Set to 1 to disable parallel plotting. Value of -1 allows the
        application to decide.
        Default value: -1
    second_ref : str, optional
        Path name for a second "Ref" (aka "Reference") data set for
        diff-of-diffs plotting. This dataset should have the same model
        type and grid as ref.
        Default value: None
    second_dev : str, optional
        Path name for a second "Ref" (aka "Reference") data set for
        diff-of-diffs plotting. This dataset should have the same model
        type and grid as ref.
        Default value: None
    time_mean : bool, optional
        Determines if we should average the datasets over time.
        Default value: False
    """

    # NOTE: this function could use some refactoring;
    # abstract processing per category?

    # ==================================================================
    # Initialization and data read
    # ==================================================================
    verify_variable_type(ref, (str, list))
    verify_variable_type(refstr, str)
    verify_variable_type(dev, (str, list))
    verify_variable_type(devstr, str)
    verify_variable_type(spcdb_files, list)

    # Create the destination folder
    make_directory(dst, overwrite)

    # Define extra title text (usually a date string)
    # for the top-title of the plot
    if subdst is not None:
        extra_title_txt = subdst
    else:
        extra_title_txt = None

    # Replace whitespace in the ref and dev labels
    refstr = replace_whitespace(refstr)
    devstr = replace_whitespace(devstr)

    # Pick the proper function to read the data
    reader = dataset_reader(time_mean, verbose=verbose)

    # Open datasets
    refds = reader(ref, drop_variables=SKIP_THESE_VARS)
    devds = reader(dev, drop_variables=SKIP_THESE_VARS)

    # Rename SpeciesConc_ to SpeciesConcVV_ for consistency with new
    # naming introduced in GEOS-Chem 14.1.0
    refds = rename_speciesconc_to_speciesconcvv(refds)
    devds = rename_speciesconc_to_speciesconcvv(devds)

    # -----------------------------------------------------------------
    # Kludge, rename wrong variable name
    if "SpeciesConcVV_PFE" in refds.data_vars.keys():
        refds = refds.rename({"SpeciesConcVV_PFE": "SpeciesConcVV_pFe"})
    if "SpeciesConcVV_PFE" in devds.data_vars.keys():
        devds = devds.rename({"SpeciesConcVV_PFE": "SpeciesConcVV_pFe"})
    # -----------------------------------------------------------------

    if verbose:
        print('\nPrinting refds (comparison ref)\n')
        print(refds)
        print('\nPrinting devds (comparison dev)\n')
        print(devds)

    # Open met datasets if passed as arguments
    refmetds = None
    devmetds = None
    if refmet:
        refmetds = reader(refmet, drop_variables=SKIP_THESE_VARS)
    if devmet:
        devmetds = reader(devmet, drop_variables=SKIP_THESE_VARS)

    # Determine if doing diff-of-diffs
    diff_of_diffs = second_ref is not None and second_dev is not None

    if diff_of_diffs:

        # --------------------------------------------------------------
        # %%%%% We are plotting diff-of-diffs %%%%%
        #
        # Open the second Ref and Dev datasets, if they have been
        # passed as keyword arguments, as these are needed for the
        # diff-of-diffs plots.  Regrid to the same # horizontal grid
        # resolution if the two Refs or two Devs are not on the same
        # grid.
        #
        # Also, do not take the time mean (e.g. Annual Mean) of the
        # datasets, as this will compute the the difference of means.
        # Instead, we need to compute the mean of differences.  This
        # will be done in the plotting functions compare_single_level
        # and compare_zonal_mean.
        # --------------------------------------------------------------
        second_refds = reader(second_ref, drop_variables=SKIP_THESE_VARS)
        second_devds = reader(second_dev, drop_variables=SKIP_THESE_VARS)
        
        if verbose:
            print('\nPrinting second_refds (dev of ref for diff-of-diffs)\n')
            print(second_refds)
            print('\nPrinting second_devds (dev of dev for diff-of-diffs)\n')
            print(second_devds)

        # Only regrid if ref grid resolutions differ.
        # Assume only may differ if both cubed-sphere.
        # If different, assume the two resolutions are C24 and C48,
        # and do the comparison at C48.
        regrid_ref = False
        if "Xdim" in refds.dims and "Xdim" in second_refds.dims:
            if refds['Xdim'].size != second_refds['Xdim'].size:
                regrid_ref = True
        regrid_dev = False
        if "Xdim" in devds.dims and "Xdim" in second_devds.dims:
            if devds['Xdim'].size != second_devds['Xdim'].size:
                regrid_dev = True

        if regrid_ref or regrid_dev:
            # Assume regridding C24 to C48 to compute the difference at C48
            regridfile=os.path.join(weightsdir,'regrid_weights_c24_to_c48.nc')
            transform = sparselt.esmf.load_weights(
                regridfile,
                input_dims=[('nf', 'Ydim', 'Xdim'), (6, 24, 24)],
                output_dims=[('nf', 'Ydim', 'Xdim'), (6, 48, 48)],
            )
            if regrid_ref and refds['Xdim'].size == 24:
                print('\nRegridding ref dataset 1 from C24 to C48\n')
                refds = sparselt.xr.apply(transform, refds)
                print(refds)
                print('\nRegrid complete\n')
            if regrid_ref and second_refds['Xdim'].size == 24:
                print('\nRegridding ref dataset 2 from C24 to C48\n')
                second_refds = sparselt.xr.apply(transform, second_refds)
                print(second_refds)
                print('\nRegrid complete\n')
            if regrid_dev and devds['Xdim'].size == 24:
                print('\nRegridding dev dataset 1 from C24 to C48\n')
                devds = sparselt.xr.apply(transform, devds)
                print(devds)
                print('\nRegrid complete\n')
            if regrid_dev and second_devds['Xdim'].size == 24:
                print('\nRegridding dev dataset 2 from C24 to C48\n')
                second_devds = sparselt.xr.apply(transform, second_devds)
                print(second_devds)
                print('\nRegrid complete\n')

    else:

        # --------------------------------------------------------------
        # %%%%% We are not plotting diff-of-diffs %%%%%
        #
        # We do not need the second Ref and Dev data.
        # We can also compute the time mean (e.g. Annual Mean) here.
        # --------------------------------------------------------------
        second_refds = None
        second_devds = None
        if time_mean:
            refds = dataset_mean(refds)
            devds = dataset_mean(devds)
            refmetds = dataset_mean(refmetds)
            devmetds = dataset_mean(devmetds)

    # Create regridding files if necessary while not in parallel loop
    [_ for _ in create_regridders(refds, devds, weightsdir=weightsdir)]

    # If we are normalizing by area, then merge the surface areas
    # on the Ref & Dev grids into the Ref & Dev datasets, but only
    # if they are not already there. The area variables should both
    # be called 'AREA' and be in units of m2.
    if normalize_by_area:
        if areas is not None:
            if 'AREA' not in refds.data_vars:
                refds = xr.merge([refds, areas["Ref"]])
            if 'AREA' not in devds.data_vars:
                devds = xr.merge([devds, areas["Dev"]])
            if diff_of_diffs:
                if 'AREA' not in second_refds.data_vars:
                    second_refds = xr.merge([second_refds, areas["Ref"]])
                if 'AREA' not in second_devds.data_vars:
                    second_devds = xr.merge([second_devds, areas["Dev"]])
        else:
            msg = "ERROR: normalize_by_area = True but " \
                + "the 'areas' argument was not passed!"
            raise ValueError(msg)

    # ==================================================================
    # If sending plots to one file then do all plots here and return.
    # Keep original units for plots.
    # ==================================================================
    if not plot_by_spc_cat:
        [refds, devds] = add_missing_variables(refds, devds)
        var_prefix = 'SpeciesConcVV_'
        varlist = [k for k in refds.data_vars.keys() if var_prefix in k]
        varlist.sort()

        # Surface
        pdfname = os.path.join(dst, 'SpeciesConc_Sfc.pdf')
        compare_single_level(
            refds,
            refstr,
            devds,
            devstr,
            varlist=varlist,
            cmpres=cmpres,
            pdfname=pdfname,
            use_cmap_RdBu=use_cmap_RdBu,
            log_color_scale=log_color_scale,
            extra_title_txt=extra_title_txt,
            normalize_by_area=normalize_by_area,
            weightsdir=weightsdir,
            second_ref=second_refds,
            second_dev=second_devds,
            spcdb_files=spcdb_files
        )

        add_bookmarks_to_pdf(
            pdfname,
            varlist,
            remove_prefix=var_prefix,
            verbose=verbose
        )

        # 500 hPa
        pdfname = os.path.join(dst, 'SpeciesConc_500hPa.pdf')
        compare_single_level(
            refds,
            refstr,
            devds,
            devstr,
            ilev=22,
            varlist=varlist,
            cmpres=cmpres,
            pdfname=pdfname,
            use_cmap_RdBu=use_cmap_RdBu,
            log_color_scale=log_color_scale,
            normalize_by_area=normalize_by_area,
            extra_title_txt=extra_title_txt,
            weightsdir=weightsdir,
            second_ref=second_refds,
            second_dev=second_devds,
            spcdb_files=spcdb_files
        )

        add_bookmarks_to_pdf(
            pdfname,
            varlist,
            remove_prefix=var_prefix,
            verbose=verbose
        )

        # Zonal mean
        pdfname = os.path.join(dst, 'SpeciesConc_ZnlMn.pdf')
        compare_zonal_mean(
            refds,
            refstr,
            devds,
            devstr,
            varlist=varlist,
            pdfname=pdfname,
            use_cmap_RdBu=use_cmap_RdBu,
            log_color_scale=log_color_scale,
            normalize_by_area=normalize_by_area,
            extra_title_txt=extra_title_txt,
            weightsdir=weightsdir,
            second_ref=second_refds,
            second_dev=second_devds,
            spcdb_files=spcdb_files
        )

        add_bookmarks_to_pdf(
            pdfname,
            varlist,
            remove_prefix=var_prefix,
            verbose=verbose
        )
        return

    # ==================================================================
    # Otherwise plot by categories. Convert units to ug/m3 for
    # aerosol categories: Aerosols and Secondary Organic Aerosols.
    # ==================================================================

    # FullChemBenchmark has lumped species (TransportTracers, CH4 do not)
    if "FullChem" in benchmark_type:
        print("\nComputing lumped species for full chemistry benchmark")

        print("-->Adding lumped species to ref dataset")
        refds = add_lumped_species_to_dataset(refds)

        print("-->Adding lumped species to dev dataset")
        devds = add_lumped_species_to_dataset(devds)

        if diff_of_diffs:
            print("-->Adding lumped species to second ref and dev datasets")
            second_refds = add_lumped_species_to_dataset(second_refds)
            second_devds = add_lumped_species_to_dataset(second_devds)

        archive_lumped_species_definitions(dst)
        print("Lumped species computation complete.\n")

    # Get the list of species categories
    catdict = get_species_categories(benchmark_type)
    archive_species_categories(dst)

    # Make sure that Ref and Dev datasets have the same variables.
    # Variables that are in Ref but not in Dev will be added to Dev
    # with all missing values (NaNs). And vice-versa.
    [refds, devds] = add_missing_variables(refds, devds)

    if diff_of_diffs:
        [refds, second_refds] = add_missing_variables(refds, second_refds)
        [devds, second_devds] = add_missing_variables(devds, second_devds)

    # Collection prefix
    if "SpeciesConc" in collection:
        coll_prefix = "SpeciesConcVV_"
    else:
        coll_prefix = collection.strip() + "_"

    # ==================================================================
    # Create the plots!
    # ==================================================================

    # Use dictionaries to maintain order of significant difference categories
    dict_sfc = {}
    dict_500 = {}
    dict_zm = {}

    def createplots(filecat):
        cat_diff_dict = {'sfc': [], '500': [], 'zm': []}

        # Plots units ug/m3 for certain species categories
        convert_to_ugm3 = False
        if cats_in_ugm3 is not None and filecat in cats_in_ugm3:
            convert_to_ugm3 = True

        # Suppress harmless run-time warnings from all threads
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        warnings.filterwarnings("ignore", category=UserWarning)

        # If restrict_cats list is passed,
        # skip all categories except those in the list
        if restrict_cats and filecat not in restrict_cats:
            return {filecat: cat_diff_dict}

        # Create a directory for each category.
        # If subdst is passed, then create a subdirectory in each
        # category directory (e.g. as for the 1-year benchmark).
        catdir = os.path.join(dst, filecat)
        if not os.path.isdir(catdir):
            os.mkdir(catdir)
        if subdst is not None:
            catdir = os.path.join(catdir, subdst)
            if not os.path.isdir(catdir):
                os.mkdir(catdir)

        # Get the list of variables in both Ref and Dev for each category
        # (this is computationally efficient)
        ref_vars = set(refds.data_vars)
        dev_vars = set(devds.data_vars)
        candidates = [
            coll_prefix + spc
            for subcat in catdict[filecat]
            for spc in catdict[filecat][subcat]
        ]
        varlist = \
            [var for var in candidates \
             if var in ref_vars and var in dev_vars
        ]
        warninglist = \
            [var for var in candidates \
             if var not in ref_vars or var not in dev_vars
        ]

        # Create new datasets containing only the variables for a
        # given category, as this will optimize performance.
        refds_cat = refds[varlist]
        devds_cat = devds[varlist]
        second_refds_cat = None
        if second_refds is not None:
            second_refds_cat = second_refds[varlist]
        second_devds_cat = None
        if second_devds is not None:
            second_devds_cat = second_devds[varlist]

        # -----------------------
        # Surface plots
        # -----------------------
        if "sfc" in plots:

            if subdst is not None:
                pdfname = os.path.join(
                    catdir,
                    f"{filecat}_Surface_{subdst}.pdf"
                )
                print(f"creating {pdfname}")
            else:
                pdfname = os.path.join(
                    catdir,
                    f"{filecat}_Surface.pdf"
                )

            diff_sfc = []
            compare_single_level(
                refds_cat,
                refstr,
                devds_cat,
                devstr,
                varlist=varlist,
                ilev=0,
                cmpres=cmpres,
                refmet=refmetds,
                devmet=devmetds,
                pdfname=pdfname,
                use_cmap_RdBu=use_cmap_RdBu,
                log_color_scale=log_color_scale,
                normalize_by_area=normalize_by_area,
                extra_title_txt=extra_title_txt,
                sigdiff_list=diff_sfc,
                weightsdir=weightsdir,
                convert_to_ugm3=convert_to_ugm3,
                second_ref=second_refds_cat,
                second_dev=second_devds_cat,
                n_job=n_job,
                spcdb_files=spcdb_files,
            )
            diff_sfc[:] = [v.replace(coll_prefix, "") for v in diff_sfc]
            cat_diff_dict['sfc'] = diff_sfc
            add_nested_bookmarks_to_pdf(
                pdfname,
                filecat,
                catdict,
                warninglist,
                remove_prefix=coll_prefix
            )

        # -----------------------
        # 500 hPa plots
        # -----------------------
        if "500hpa" in plots:

            if subdst is not None:
                pdfname = os.path.join(
                    catdir,
                    f"{filecat}_500hPa_{subdst}.pdf"
                )
            else:
                pdfname = os.path.join(
                    catdir,
                    f"{filecat}_500hPa.pdf"
                )

            diff_500 = []
            compare_single_level(
                refds_cat,
                refstr,
                devds_cat,
                devstr,
                varlist=varlist,
                ilev=22,
                cmpres=cmpres,
                refmet=refmetds,
                devmet=devmetds,
                pdfname=pdfname,
                use_cmap_RdBu=use_cmap_RdBu,
                log_color_scale=log_color_scale,
                normalize_by_area=normalize_by_area,
                extra_title_txt=extra_title_txt,
                sigdiff_list=diff_500,
                weightsdir=weightsdir,
                convert_to_ugm3=convert_to_ugm3,
                second_ref=second_refds_cat,
                second_dev=second_devds_cat,
                n_job=n_job,
                spcdb_files=spcdb_files
            )
            diff_500[:] = [v.replace(coll_prefix, "") for v in diff_500]
            #dict_500[filecat] = diff_500
            cat_diff_dict['500'] = diff_500
            add_nested_bookmarks_to_pdf(
                pdfname,
                filecat,
                catdict,
                warninglist,
                remove_prefix=coll_prefix
            )

        # -----------------------
        # Zonal mean plots
        # -----------------------
        if "zonalmean" in plots or "zm" in plots:

            if subdst is not None:
                pdfname = os.path.join(
                    catdir,
                    f"{filecat}_FullColumn_ZonalMean_{subdst}.pdf"
                )
            else:
                pdfname = os.path.join(
                    catdir,
                    f"{filecat}_FullColumn_ZonalMean.pdf"
                )

            diff_zm = []
            compare_zonal_mean(
                refds_cat,
                refstr,
                devds_cat,
                devstr,
                varlist=varlist,
                refmet=refmetds,
                devmet=devmetds,
                pdfname=pdfname,
                use_cmap_RdBu=use_cmap_RdBu,
                log_color_scale=log_color_scale,
                normalize_by_area=normalize_by_area,
                extra_title_txt=extra_title_txt,
                sigdiff_list=diff_zm,
                weightsdir=weightsdir,
                convert_to_ugm3=convert_to_ugm3,
                second_ref=second_refds_cat,
                second_dev=second_devds_cat,
                n_job=n_job,
                spcdb_files=spcdb_files
            )
            diff_zm[:] = [v.replace(coll_prefix, "") for v in diff_zm]
            #dict_zm = diff_zm
            cat_diff_dict['zm'] = diff_zm
            add_nested_bookmarks_to_pdf(
                pdfname,
                filecat,
                catdict,
                warninglist,
                remove_prefix=coll_prefix
            )

            # Strat_ZonalMean plots will use a log-pressure Y-axis, with
            # a range of 1..100 hPa, as per GCSC request. (bmy, 8/13/19)
            if subdst is not None:
                pdfname = os.path.join(
                    catdir,
                    f"{filecat}_Strat_ZonalMean_{subdst}.pdf"
                )
            else:
                pdfname = os.path.join(
                    catdir,
                    f"{filecat}_Strat_ZonalMean.pdf"
                )

            compare_zonal_mean(
                refds_cat,
                refstr,
                devds_cat,
                devstr,
                varlist=varlist,
                refmet=refmetds,
                devmet=devmetds,
                pdfname=pdfname,
                use_cmap_RdBu=use_cmap_RdBu,
                pres_range=[1, 100],
                log_yaxis=True,
                extra_title_txt=extra_title_txt,
                log_color_scale=log_color_scale,
                normalize_by_area=normalize_by_area,
                convert_to_ugm3=convert_to_ugm3,
                weightsdir=weightsdir,
                second_ref=second_refds_cat,
                second_dev=second_devds_cat,
                n_job=n_job,
                spcdb_files=spcdb_files
            )
            add_nested_bookmarks_to_pdf(
                pdfname,
                filecat,
                catdict,
                warninglist,
                remove_prefix=coll_prefix
            )
        return {filecat: cat_diff_dict}

    # --------------------------------------------
    # Create the plots in parallel
    # Turn off parallelization if n_job=1
    if n_job != 1:
        results = Parallel(n_jobs=n_job)(
            delayed(createplots)(filecat)
            for _, filecat in enumerate(catdict)
        )
    else:
        results = []
        for _, filecat in enumerate(catdict):
            results.append(createplots(filecat))
    # --------------------------------------------

    dict_sfc = {list(result.keys())[0]: result[list(
        result.keys())[0]]['sfc'] for result in results}
    dict_500 = {list(result.keys())[0]: result[list(
        result.keys())[0]]['500'] for result in results}
    dict_zm = {list(result.keys())[0]: result[list(
        result.keys())[0]]['zm'] for result in results}

    # ==============================================================
    # Write the list of species having significant differences,
    # which we need to fill out the benchmark approval forms.
    # ==============================================================
    if sigdiff_files is not None:
        for filename in sigdiff_files:
            if "sfc" in plots:
                if "sfc" in filename:
                    with open(filename, "a+", encoding=ENCODING) as f:
                        for c, diff_list in dict_sfc.items():
                            print(f"* {c}: ", file=f, end="")
                            for v in diff_list:
                                print(f"{v} ", file=f, end="")
                            print(file=f)
                        f.close()

            if "500hpa" in plots:
                if "500hpa" in filename:
                    with open(filename, "a+", encoding=ENCODING) as f:
                        for c, diff_list in dict_500.items():
                            print(f"* {c}: ", file=f, end="")
                            for v in diff_list:
                                print(f"{v} ", file=f, end="")
                            print(file=f)
                        f.close()

            if "zonalmean" in plots or "zm" in plots:
                if "zonalmean" in filename or "zm" in filename:
                    with open(filename, "a+", encoding=ENCODING) as f:
                        for c, diff_list in dict_zm.items():
                            print(f"* {c}: ", file=f, end="")
                            for v in diff_list:
                                print(f"{v} ", file=f, end="")
                            print(file=f)
                        f.close()

    # -------------------------------------------
    # Clean up
    # -------------------------------------------
    del refds
    del devds
    del refmetds
    del devmetds
    del second_ref
    del second_dev
    gc.collect()




[docs]
def make_benchmark_emis_plots(
        ref,
        refstr,
        dev,
        devstr,
        spcdb_files,
        dst="./benchmark",
        subdst=None,
        plot_by_spc_cat=False,
        plot_by_hco_cat=False,
        benchmark_type="FullChemBenchmark",
        cmpres=None,
        overwrite=False,
        verbose=False,
        flip_ref=False,
        flip_dev=False,
        log_color_scale=False,
        sigdiff_files=None,
        weightsdir='.',
        n_job=-1,
        time_mean=False,
):
    """
    Creates PDF files containing plots of emissions for model
    benchmarking purposes. This function is compatible with benchmark
    simulation output only. It is not compatible with transport tracers
    emissions diagnostics.

    Parameters
    ----------
    ref : str
        Path name for the "Ref" (aka "Reference") data set.
    refstr : str
        A string to describe ref (e.g. version number).
    dev : str
        Path name for the "Dev" (aka "Development") data set.
        This data set will be compared against the "Reference"
        data set.
    devstr : str
        A string to describe dev (e.g. version number).
    spcdb_files : list
        Paths to species_database.yml files in Ref & Dev rundirs.
    dst : str, optional
        A string denoting the destination folder where
        PDF files containing plots will be written.
        Default value: './benchmark'
    subdst : str, optional
        A string denoting the sub-directory of dst where PDF
        files containing plots will be written.  In practice,
        and denotes a date string (such as "Jan2016") that
        corresponds to the month that is being plotted.
        Default value: None
    plot_by_spc_cat : bool, optional
        Set this flag to True to separate plots into PDF files
        according to the benchmark species categories (e.g. Oxidants,
        Aerosols, Nitrogen, etc.)  These categories are specified
        in the YAML file benchmark_categories.yml.
        Default value: False
    plot_by_hco_cat : bool, optional
        Set this flag to True to separate plots into PDF files
        according to HEMCO emissions categories (e.g. Anthro,
        Aircraft, Bioburn, etc.)
        Default value: False
    benchmark_type : str, optional
        A string denoting the type of benchmark output to plot, options are
        FullChemBenchmark, TransportTracersBenchmark, or CH4Benchmark.
        Default value: "FullChemBenchmark"
    cmpres : str, optional
        Grid resolution at which to compare ref and dev data, e.g. '1x1.25'.
    overwrite : bool, optional
        Set this flag to True to overwrite files in the
        destination folder (specified by the dst argument).
        Default value: False
    verbose : bool, optional
        Set this flag to True to print extra informational output.
        Default value: False
    flip_ref : bool, optional
        Set this flag to True to reverse the vertical level
        ordering in the "Ref" dataset (in case "Ref" starts
        from the top of atmosphere instead of the surface).
        Default value: False
    flip_dev : bool, optional
        Set this flag to True to reverse the vertical level
        ordering in the "Dev" dataset (in case "Dev" starts
        from the top of atmosphere instead of the surface).
        Default value: False
    log_color_scale : bool, optional
        Set this flag to True to enable plotting data (not diffs)
        on a log color scale.
        Default value: False
    sigdiff_files : list of str, optional
        Filenames that will contain the lists of species having
        significant differences in the 'sfc', '500hpa', and
        'zonalmean' plots.  These lists are needed in order to
        fill out the benchmark approval forms.
        Default value: None
    weightsdir : str, optional
        Directory in which to place (and possibly reuse) xESMF regridder
        netCDF files.
        Default value: '.'
    n_job : int, optional
        Defines the number of simultaneous workers for parallel plotting.
        Set to 1 to disable parallel plotting.
        Value of -1 allows the application to decide.
        Default value: -1
    time_mean : bool, optional
        Determines if we should average the datasets over time.
        Default value: False

    Notes
    -----
    1. If both plot_by_spc_cat and plot_by_hco_cat are
       False, then all emission plots will be placed into the
       same PDF file.
    2. Emissions that are 3-dimensional will be plotted as
       column sums.
    """
    # =================================================================
    # Initialization and data read
    # =================================================================
    verify_variable_type(ref, (str,list))
    verify_variable_type(refstr, str)
    verify_variable_type(dev, (str,list))
    verify_variable_type(devstr, str)
    verify_variable_type(spcdb_files, list)

    # Create the destination folder
    make_directory(dst, overwrite)

    # Create the "Emissions" category folder.  If subdst is passed,
    # then create a sub-folder (needed for the 1-year benchmarks).
    emisdir = os.path.join(dst, "Emissions")
    if not os.path.isdir(emisdir):
        os.mkdir(emisdir)
    if subdst is not None:
        emisdir = os.path.join(emisdir, subdst)
        if not os.path.isdir(emisdir):
            os.mkdir(emisdir)
        extra_title_txt = subdst
    else:
        extra_title_txt = None

    # Replace whitespace in the ref and dev labels
    refstr = replace_whitespace(refstr)
    devstr = replace_whitespace(devstr)

    # Get the function that will read the dataset
    reader = dataset_reader(time_mean, verbose=verbose)

    # Ref dataset
    try:
        refds = reader(ref, drop_variables=SKIP_THESE_VARS)
    except (OSError, IOError, FileNotFoundError) as e:
        raise e(f"Could not find Ref file: {ref}") from e

    # Dev dataset
    try:
        devds = reader(dev, drop_variables=SKIP_THESE_VARS)
    except (OSError, IOError, FileNotFoundError) as e:
        raise e(f"Could not find Ref file: {dev}") from e

    # Compute mean of data over the time dimension (if time_mean=True)
    if time_mean:
        refds = dataset_mean(refds)
        devds = dataset_mean(devds)

    # Make sure that Ref and Dev datasets have the same variables.
    # Variables that are in Ref but not in Dev will be added to Dev
    # with all missing values (NaNs). And vice-versa.
    [refds, devds] = add_missing_variables(refds, devds)

    # Create regridding files if necessary while not in parallel loop
    [_ for _ in create_regridders(refds, devds, weightsdir=weightsdir)]

    # Combine 2D and 3D variables into an overall list
    vardict = compare_varnames(refds, devds, quiet=not verbose)
    vars2D = vardict["commonvars2D"]
    vars3D = vardict["commonvars3D"]
    varlist = vars2D + vars3D

    # ==================================================================
    # Compute column sums for 3D emissions
    # Make sure not to clobber the DataArray attributes
    # ==================================================================
    with xr.set_options(keep_attrs=True):
        for v in vars3D:
            if "lev" in refds[v].dims:
                refds[v] = refds[v].sum(dim="lev")
            if "lev" in devds[v].dims:
                devds[v] = devds[v].sum(dim="lev")

    # ==================================================================
    # If inputs plot_by* are both false, plot all emissions in same file
    # ==================================================================
    if not plot_by_spc_cat and not plot_by_hco_cat:
        if subdst is not None:
            pdfname = os.path.join(
                emisdir,
                f"Emissions_{subdst}.pdf"
            )
        else:
            pdfname = os.path.join(
                emisdir,
                "Emissions.pdf"
            )

        compare_single_level(
            refds,
            refstr,
            devds,
            devstr,
            varlist=varlist,
            cmpres=cmpres,
            pdfname=pdfname,
            log_color_scale=log_color_scale,
            extra_title_txt=extra_title_txt,
            weightsdir=weightsdir,
            n_job=n_job,
            spcdb_files=spcdb_files,
        )
        add_bookmarks_to_pdf(
            pdfname,
            varlist,
            remove_prefix="Emis",
            verbose=verbose
        )
        return

    # Get emissions variables (non-inventory), categories, and species
    emis_vars = [v for v in varlist if v[:4] == "Emis"]
    emis_cats = sorted(set([v.split("_")[1] for v in emis_vars]))
    emis_spc = sorted(set([v.split("_")[0][4:] for v in emis_vars]))

    # This is fixed in 12.3.2, comment out for now (bmy, 5/1/19)
    #    # Handle Bioburn and BioBurn as same categories (temporary until 12.3.1)
    #    emis_cats.remove('BioBurn')

    # Sort alphabetically (assume English characters)
    emis_vars.sort(key=str.lower)

    # ==================================================================
    # if plot_by_hco_cat is true, make a file for each HEMCO emissions
    # category that is in the diagnostics file
    #
    # Also write the list of emission quantities that have significant
    # diffs.  We'll need that to fill out the benchmark forms.
    # ==================================================================

    if plot_by_hco_cat:
        emisspcdir = os.path.join(dst, "Emissions")
        if not os.path.isdir(emisspcdir):
            os.mkdir(emisspcdir)
        if subdst is not None:
            emisspcdir = os.path.join(emisspcdir, subdst)
            if not os.path.isdir(emisspcdir):
                os.mkdir(emisspcdir)

        # for c in emis_cats:
        def createfile_hco_cat(c):
            # Handle cases of bioburn and bioBurn (temporary until 12.3.1)
            if c == "Bioburn":
                varnames = [k for k in emis_vars
                            if any(b in k for b in ["Bioburn", "BioBurn"])
                            ]
            else:
                varnames = [k for k in emis_vars if c in k]

            # Create the PDF name.  If subdst is passed, then also add
            # subdst to the file name (e.g. as for 1-year benchmarks).
            if subdst is not None:
                pdfname = os.path.join(
                    emisspcdir,
                    f"{c}_Emissions_{subdst}.pdf"
                )
            else:
                pdfname = os.path.join(
                    emisspcdir,
                    f"{c}_Emissions.pdf"
                )
            diff_dict = {}
            diff_emis = []
            compare_single_level(
                refds,
                refstr,
                devds,
                devstr,
                varlist=varnames,
                cmpres=cmpres,
                ilev=0,
                pdfname=pdfname,
                log_color_scale=log_color_scale,
                extra_title_txt=extra_title_txt,
                sigdiff_list=diff_emis,
                weightsdir=weightsdir,
                n_job=n_job,
                spcdb_files=spcdb_files
            )

            add_bookmarks_to_pdf(
                pdfname,
                varnames,
                remove_prefix="Emis",
                verbose=verbose
            )
            # Save the list of quantities with significant differences for
            # this category into the diff_dict dictionary for use below
            diff_emis[:] = [v.replace("Emis", "") for v in diff_emis]
            diff_emis[:] = [v.replace("_" + c, "") for v in diff_emis]
            diff_dict[c] = diff_emis
            return diff_dict

        # ---------------------------------------
        # Create plots in parallel
        # Turn off parallelization if n_job=1
        if n_job != 1:
            results = Parallel(n_jobs=n_job)(
                delayed(createfile_hco_cat)(c)
                for c in emis_cats
            )
        else:
            results = []
            for c in emis_cats:
                results.append(createfile_hco_cat(c))
        # ---------------------------------------

        dict_emis = {list(result.keys())[0]: result[list(result.keys())[0]]
                     for result in results}

        # =============================================================
        # Write the list of species having significant differences,
        # which we need to fill out the benchmark approval forms.
        # =============================================================
        if sigdiff_files is not None:
            for filename in sigdiff_files:
                if "emis" in filename:
                    with open(filename, "w+", encoding=ENCODING) as f:
                        for c, diff_list in dict_emis.items():
                            print(f"* {c}: ", file=f, end="")
                            for v in diff_list:
                                print(f"{v} ", file=f, end="")
                            print(file=f)
                        f.close()

    # ==================================================================
    # if plot_by_spc_cat is true, make a file for each benchmark
    # species category with emissions in the diagnostics file
    # ==================================================================
    if plot_by_spc_cat:

        catdict = get_species_categories(benchmark_type)
        # in case any emissions are skipped (for use in nested pdf bookmarks)
        warninglist = []
        # for checking if emissions species not defined in benchmark category
        # file
        allcatspc = []
        emisdict = {}  # used for nested pdf bookmarks
        # for i, filecat in enumerate(catdict):

        def createfile_bench_cat(filecat):
            # Get emissions for species in this benchmark category
            varlist = []
            emisdict[filecat] = {}
            catspc = []
            for subcat in catdict[filecat]:
                for spc in catdict[filecat][subcat]:
                    catspc.append(spc)
                    if spc in emis_spc:
                        emisdict[filecat][spc] = []
                        emisvars = [v for v in emis_vars
                                    if spc == v.split("_")[0][4:]]
                        for var in emisvars:
                            emisdict[filecat][spc].append(
                                var.replace("Emis", ""))
                            varlist.append(var)
            if not varlist:
                print(
                    "\nWarning: no emissions species in benchmark species" + \
                    f"category {filecat}"
                )
                return catspc

            # Use same directory structure as for concentration plots
            catdir = os.path.join(dst, filecat)
            if not os.path.isdir(catdir):
                os.mkdir(catdir)
            if subdst is not None:
                catdir = os.path.join(catdir, subdst)
                if not os.path.isdir(catdir):
                    os.mkdir(catdir)

            # Create emissions file for this benchmark species category
            # If subdst is passed, add it to the pdf name (e.g. as
            # is needed for the 1-year benchmarks).
            if subdst is not None:
                pdfname = os.path.join(
                    catdir,
                    f"{filecat}_Emissions_{subdst}.pdf"
                )
            else:
                pdfname = os.path.join(
                    catdir,
                    f"{filecat}_Emissions.pdf"
                )
            # Create the PDF
            compare_single_level(
                refds,
                refstr,
                devds,
                devstr,
                varlist=varlist,
                cmpres=cmpres,
                ilev=0,
                pdfname=pdfname,
                flip_ref=flip_ref,
                flip_dev=flip_dev,
                log_color_scale=log_color_scale,
                extra_title_txt=extra_title_txt,
                weightsdir=weightsdir,
                n_job=n_job,
                spcdb_files=spcdb_files,
            )
            add_nested_bookmarks_to_pdf(
                pdfname,
                filecat,
                emisdict,
                warninglist
            )
            return catspc

        #------------------------------------------------
        # Create plots in parallel
        # Turn of parallalization if n_job=1
        if n_job != 1:
            results = Parallel(n_jobs=n_job)(
                delayed(createfile_bench_cat)(filecat)
                for _, filecat in enumerate(catdict)
            )
        else:
            results = []
            for _, filecat in enumerate(catdict):
                results.append(createfile_bench_cat(filecat))
        #------------------------------------------------

        allcatspc = [spc for result in results for spc in result]
        # Give warning if emissions species is not assigned a benchmark
        # category
        for spc in emis_spc:
            if spc not in allcatspc:
                print(\
                    f"Warning: species {spc} has emissions diagnostics but is not"
                      " in benchmark_categories.yml"
                )

    # -------------------------------------------
    # Clean up
    # -------------------------------------------
    del refds
    del devds
    gc.collect()




[docs]
def make_benchmark_emis_tables(
        reflist,
        refstr,
        devlist,
        devstr,
        spcdb_files,
        dst="./benchmark",
        benchmark_type="FullChemBenchmark",
        refmet=None,
        devmet=None,
        overwrite=False,
        ref_interval=[2678400.0],
        dev_interval=[2678400.0],
):
    """
    Creates a text file containing emission totals by species and
    category for benchmarking purposes.

    Parameters
    ----------
    reflist : str or list of str
        Path name(s) of the emissions file(s) that constitute
        the "Ref" (aka "Reference") data set.
    refstr : str
        A string to describe ref (e.g. version number).
    devlist : str or list of str
        Path name(s) of the emissions file(s) that constitute
        the "Dev" (aka "Development") data set.
    devstr : str
        A string to describe dev (e.g. version number).
    spcdb_files : list
        Paths to species_database.yml files in Ref & Dev rundirs.
    dst : str, optional
        A string denoting the destination folder where the file
        containing emissions totals will be written.
        Default value: ./benchmark
    benchmark_type : str, optional
        A string denoting the type of benchmark output to plot, options are
        FullChemBenchmark, TransportTracersBenchmark or CH4Benchmark.
        Default value: "FullChemBenchmark"
    refmet : str, optional
        Path name for ref meteorology.
        Default value: None
    devmet : str, optional
        Path name for dev meteorology.
        Default value: None
    overwrite : bool, optional
        Set this flag to True to overwrite files in the
        destination folder (specified by the dst argument).
        Default value: False
    ref_interval : list of float, optional
        The length of the ref data interval in seconds. By default, interval
        is set to [2678400.0], which is the number of seconds in July
        (our 1-month benchmarking month).
        Default value: [2678400.0]
    dev_interval : list of float, optional
        The length of the dev data interval in seconds. By default, interval
        is set to [2678400.0], which is the number of seconds in July
        (our 1-month benchmarking month).
        Default value: [2678400.0]
    """

    # ==================================================================
    # Initialization
    # ==================================================================
    verify_variable_type(reflist, (str,list))
    verify_variable_type(refstr, str)
    verify_variable_type(devlist, (str,list))
    verify_variable_type(devstr, str)
    verify_variable_type(spcdb_files, list)

    # Create the destination folder
    make_directory(dst, overwrite)

    # Create the "Tables" category folder if it does not exist
    emisdir = os.path.join(dst, "Tables")
    if not os.path.isdir(emisdir):
        os.mkdir(emisdir)

    # Replace whitespace in the ref and dev labels
    refstr = replace_whitespace(refstr)
    devstr = replace_whitespace(devstr)

    # ==================================================================
    # Read data from netCDF into Dataset objects
    # ==================================================================

    # Read the input datasets
    # Also read the meteorology datasets if passed. These are optional
    # since the refds and devds have variable AREA already (always true)
    # and unit conversions do not require any meteorology.
    if isinstance(reflist, str):
        reflist = [reflist]
    if isinstance(devlist, str):
        devlist = [devlist]
    refmetds = None
    devmetds = None

    refds = xr.open_mfdataset(
        reflist,
        drop_variables=SKIP_THESE_VARS
    )
    devds = xr.open_mfdataset(
        devlist,
        drop_variables=SKIP_THESE_VARS
    )
    if refmet is not None:
        refmetds = xr.open_mfdataset(
            refmet,
            drop_variables=SKIP_THESE_VARS
        )
    if devmet is not None:
        devmetds = xr.open_mfdataset(
            devmet,
            drop_variables=SKIP_THESE_VARS
        )

    # ==================================================================
    # Create table of emissions
    # ==================================================================

    # Emissions species dictionary
    spc_dict = read_config_file(
        os.path.join(
            os.path.dirname(__file__),
            EMISSION_SPC
        ),
        quiet=True
    )
    species=spc_dict[benchmark_type]
    inv_dict = read_config_file(
        os.path.join(
            os.path.dirname(__file__),
            EMISSION_INV
        ),
        quiet=True
    )
    inventories=inv_dict[benchmark_type]

    # Destination files
    file_emis_totals = os.path.join(emisdir, "Emission_totals.txt")
    file_inv_totals = os.path.join(emisdir, "Inventory_totals.txt")

    # Create table of emissions by species
    create_total_emissions_table(
        refds,
        refstr,
        devds,
        devstr,
        species,
        spcdb_files,
        file_emis_totals,
        ref_interval,
        dev_interval,
        template="Emis{}_",
        refmetdata=refmetds,
        devmetdata=devmetds,
    )

    # Create table of emissions by inventory
    create_total_emissions_table(
        refds,
        refstr,
        devds,
        devstr,
        inventories,
        spcdb_files,
        file_inv_totals,
        ref_interval,
        dev_interval,
        template="Inv{}_",
        refmetdata=refmetds,
        devmetdata=devmetds,
    )

    # -------------------------------------------
    # Clean up
    # -------------------------------------------
    del refds
    del devds
    del refmetds
    del devmetds
    gc.collect()




[docs]
def make_benchmark_jvalue_plots(
        ref,
        refstr,
        dev,
        devstr,
        spcdb_files,
        varlist=None,
        dst="./benchmark",
        subdst=None,
        local_noon_jvalues=False,
        cmpres=None,
        plots=["sfc", "500hpa", "zonalmean"],
        overwrite=False,
        verbose=False,
        flip_ref=False,
        flip_dev=False,
        log_color_scale=False,
        sigdiff_files=None,
        weightsdir='.',
        n_job=-1,
        time_mean=False,
):
    """
    Creates PDF files containing plots of J-values for model
    benchmarking purposes.

    Parameters
    ----------
    ref : str
        Path name for the "Ref" (aka "Reference") data set.
    refstr : str
        A string to describe ref (e.g. version number).
    dev : str
        Path name for the "Dev" (aka "Development") data set.
        This data set will be compared against the "Reference"
        data set.
    devstr : str
        A string to describe dev (e.g. version number).
    spcdb_files : list
        Paths to species_database.yml files in Ref & Dev rundirs.
    varlist : list of str, optional
        List of J-value variables to plot.  If not passed,
        then all J-value variables common to both dev
        and ref will be plotted.  The varlist argument can be
        a useful way of restricting the number of variables
        plotted to the pdf file when debugging.
        Default value: None
    dst : str, optional
        A string denoting the destination folder where a
        PDF file containing plots will be written.
        Default value: ./benchmark.
    subdst : str, optional
        A string denoting the sub-directory of dst where PDF
        files containing plots will be written.  In practice,
        subdst is only needed for the 1-year benchmark output,
        and denotes a date string (such as "Jan2016") that
        corresponds to the month that is being plotted.
        Default value: None
    local_noon_jvalues : bool, optional
        Set this flag to plot local noon J-values.  This will
        divide all J-value variables by the JNoonFrac counter,
        which is the fraction of the time that it was local noon
        at each location.
        Default value: False
    cmpres : str, optional
        Grid resolution at which to compare ref and dev data, e.g. '1x1.25'.
    plots : list of str, optional
        List of plot types to create.
        Default value: ['sfc', '500hpa', 'zonalmean']
    overwrite : bool, optional
        Set this flag to True to overwrite files in the
        destination folder (specified by the dst argument).
        Default value: False.
    verbose : bool, optional
        Set this flag to True to print extra informational output.
        Default value: False
    flip_ref : bool, optional
        Set this flag to True to reverse the vertical level
        ordering in the "Ref" dataset (in case "Ref" starts
        from the top of atmosphere instead of the surface).
        Default value: False
    flip_dev : bool, optional
        Set this flag to True to reverse the vertical level
        ordering in the "Dev" dataset (in case "Dev" starts
        from the top of atmosphere instead of the surface).
        Default value: False
    log_color_scale : bool, optional
        Set this flag to True if you wish to enable plotting data
        (not diffs) on a log color scale.
        Default value: False
    sigdiff_files : list of str, optional
        Filenames that will contain the lists of J-values having
        significant differences in the 'sfc', '500hpa', and
        'zonalmean' plots.  These lists are needed in order to
        fill out the benchmark approval forms.
        Default value: None
    weightsdir : str, optional
        Directory in which to place (and possibly reuse) xESMF regridder
        netCDF files.
        Default value: '.'
    n_job : int, optional
        Defines the number of simultaneous workers for parallel plotting.
        Set to 1 to disable parallel plotting. Value of -1 allows the
        application to decide.
        Default value: -1
    time_mean : bool, optional
        Determines if we should average the datasets over time.
        Default value: False

    Notes
    -----
    Will create 4 files containing J-value plots:

    1. Surface values
    2. 500 hPa values
    3. Full-column zonal mean values
    4. Stratospheric zonal mean values

    These can be toggled on/off with the plots keyword argument.

    At present, we do not yet have the capability to split the
    plots up into separate files per category (e.g. Oxidants,
    Aerosols, etc.).  This is primarily due to the fact that
    we archive J-values from GEOS-Chem for individual species
    but not family species.  We could attempt to add this
    functionality later if there is sufficient demand.
    """

    # ==================================================================
    # Initialization
    # ==================================================================

    # Create the directory for output
    make_directory(dst, overwrite)

    # Replace whitespace in the ref and dev labels
    refstr = replace_whitespace(refstr)
    devstr = replace_whitespace(devstr)

    # Get the function that will read file(s) into a Dataset
    reader = dataset_reader(time_mean, verbose=verbose)

    # Ref dataset
    try:
        refds = reader(ref, drop_variables=SKIP_THESE_VARS)
    except (OSError, IOError, FileNotFoundError) as e:
        raise e(f"Could not find Ref file: {ref}") from e

    # Dev dataset
    try:
        devds = reader(dev, drop_variables=SKIP_THESE_VARS)
    except (OSError, IOError, FileNotFoundError) as e:
        raise e(f"Could not find Ref file: {dev}") from e

    # Compute mean of data over the time dimension (if time_mean=True)
    if time_mean:
        refds = dataset_mean(refds)
        devds = dataset_mean(devds)

    # Make sure that Ref and Dev datasets have the same variables.
    # Variables that are in Ref but not in Dev will be added to Dev
    # with all missing values (NaNs). And vice-versa.
    [refds, devds] = add_missing_variables(refds, devds)

    # Create regridding files if necessary
    [_ for _ in create_regridders(refds, devds, weightsdir=weightsdir)]

    # Get a list of the 3D variables in both datasets
    if varlist is None:
        vardict = compare_varnames(refds, devds, quiet=not verbose)
        cmn = vardict["commonvars3D"]

    # ==================================================================
    # Local noon or continuously-averaged J-values?
    # ==================================================================
    if local_noon_jvalues:

        # Get a list of local noon J-value variables
        # (or use the varlist passed via tha argument list)
        prefix = "JNoon_"
        if varlist is None:
            varlist = [v for v in cmn if prefix in v]

        # Make sure JNoonFrac (fraction of times it was local noon
        # in each column) is present in both Ref and Dev datasets
        if "JNoonFrac" not in cmn:
            msg = "JNoonFrac is not common to Ref and Dev datasets!"
            raise ValueError(msg)

        # JNoon_* are cumulative sums of local noon J-values; we need
        # to divide these by JNoonFrac to get the average value
        refds = divide_dataset_by_dataarray(refds,
                                                 refds["JNoonFrac"], varlist)
        devds = divide_dataset_by_dataarray(devds,
                                                 devds["JNoonFrac"], varlist)

        # Subfolder of dst where PDF files will be printed
        catdir = "JValuesLocalNoon"

    else:

        # Get a list of continuously averaged J-value variables
        # (or use the varlist passed via tha argument list)
        prefix = "Jval"
        if varlist is None:
            varlist = [v for v in cmn if prefix in v]

        # Subfolder of dst where PDF files will be printed
        catdir = "JValues"

    # ==================================================================
    # Create the plots
    # ==================================================================

    # Make the output folder if it doesn't exist.  If subdst is passed,
    # then create a sub-folder of this directory (e.g. which is needed
    # for the 1-year benchmarks)
    jvdir = os.path.join(dst, catdir)
    if not os.path.isdir(jvdir):
        os.mkdir(jvdir)
    if subdst is not None:
        jvdir = os.path.join(jvdir, subdst)
        if not os.path.isdir(jvdir):
            os.mkdir(jvdir)
        extra_title_txt = subdst
    else:
        extra_title_txt = None

    # Surface plots
    if "sfc" in plots:
        if subdst is not None:
            pdfname = os.path.join(
                jvdir,
                f"{prefix}_Surface_{subdst}.pdf"
            )
        else:
            pdfname = os.path.join(
                jvdir,
                f"{prefix}_Surface.pdf"
            )

        diff_sfc = []
        compare_single_level(
            refds,
            refstr,
            devds,
            devstr,
            varlist=varlist,
            ilev=0,
            pdfname=pdfname,
            flip_ref=flip_ref,
            flip_dev=flip_dev,
            log_color_scale=log_color_scale,
            extra_title_txt=extra_title_txt,
            sigdiff_list=diff_sfc,
            weightsdir=weightsdir,
            n_job=n_job,
            spcdb_files=spcdb_files,
        )
        diff_sfc[:] = [v.replace(prefix, "") for v in diff_sfc]
        add_bookmarks_to_pdf(
            pdfname,
            varlist,
            remove_prefix=prefix,
            verbose=verbose)

    # 500hPa plots
    if "500hpa" in plots:
        if subdst is not None:
            pdfname = os.path.join(
                jvdir,
                f"{prefix}_500hPa_{subdst}.pdf"
            )
        else:
            pdfname = os.path.join(
                jvdir, f"{prefix}_500hPa.pdf"
            )

        diff_500 = []
        compare_single_level(
            refds,
            refstr,
            devds,
            devstr,
            varlist=varlist,
            cmpres=cmpres,
            ilev=22,
            pdfname=pdfname,
            flip_ref=flip_ref,
            flip_dev=flip_dev,
            log_color_scale=log_color_scale,
            extra_title_txt=extra_title_txt,
            sigdiff_list=diff_500,
            weightsdir=weightsdir,
            n_job=n_job,
            spcdb_files=spcdb_files,
        )
        diff_500[:] = [v.replace(prefix, "") for v in diff_500]
        add_bookmarks_to_pdf(
            pdfname,
            varlist,
            remove_prefix=prefix,
            verbose=verbose
        )

    # Full-column zonal mean plots
    if "zonalmean" in plots:
        if subdst is not None:
            pdfname = os.path.join(
                jvdir,
                f"{prefix}_FullColumn_ZonalMean_{subdst}.pdf"
            )
        else:
            pdfname = os.path.join(
                jvdir, f"{prefix}_FullColumn_ZonalMean.pdf"
            )

        diff_zm = []
        compare_zonal_mean(
            refds,
            refstr,
            devds,
            devstr,
            varlist=varlist,
            pdfname=pdfname,
            flip_ref=flip_ref,
            flip_dev=flip_dev,
            log_color_scale=log_color_scale,
            extra_title_txt=extra_title_txt,
            sigdiff_list=diff_zm,
            weightsdir=weightsdir,
            n_job=n_job,
            spcdb_files=spcdb_files,
        )
        diff_zm[:] = [v.replace(prefix, "") for v in diff_zm]
        add_bookmarks_to_pdf(
            pdfname,
            varlist,
            remove_prefix=prefix,
            verbose=verbose)

        # Strat_ZonalMean plots will use a log-pressure Y-axis, with
        # a range of 1..100 hPa, as per GCSC request. (bmy, 8/13/19)
        if subdst is not None:
            pdfname = os.path.join(
                jvdir,
                f"{prefix}_Strat_ZonalMean_{subdst}.pdf"
            )
        else:
            pdfname = os.path.join(
                jvdir,
                f"{prefix}_Strat_ZonalMean.pdf"
            )

        compare_zonal_mean(
            refds,
            refstr,
            devds,
            devstr,
            varlist=varlist,
            pdfname=pdfname,
            pres_range=[1, 100],
            log_yaxis=True,
            flip_ref=flip_ref,
            flip_dev=flip_dev,
            extra_title_txt=extra_title_txt,
            log_color_scale=log_color_scale,
            weightsdir=weightsdir,
            n_job=n_job,
            spcdb_files=spcdb_files,
        )
        add_bookmarks_to_pdf(
            pdfname,
            varlist,
            remove_prefix=prefix,
            verbose=verbose
        )

        # ==============================================================
        # Write the lists of J-values that have significant differences,
        # which we need to fill out the benchmark approval forms.
        # ==============================================================
        if sigdiff_files is not None:
            for filename in sigdiff_files:
                if "sfc" in plots:
                    if "sfc" in filename:
                        with open(filename, "a+", encoding=ENCODING) as f:
                            print("* J-Values: ", file=f, end="")
                            for v in diff_sfc:
                                print(f"{v} ", file=f, end="")
                            print(file=f)
                            f.close()

                if "500" in plots:
                    if "500" in filename:
                        with open(filename, "a+", encoding=ENCODING) as f:
                            print("* J-Values: ", file=f, end="")
                            for v in diff_500:
                                print(f"{v} ", file=f, end="")
                            print(file=f)
                            f.close()

                if "zonalmean" in plots or "zm" in plots:
                    if "zonalmean" in filename or "zm" in filename:
                        with open(filename, "a+", encoding=ENCODING) as f:
                            print("* J-Values: ", file=f, end="")
                            for v in diff_zm:
                                print(f"{v} ", file=f, end="")
                            print(file=f)
                            f.close()

    # -------------------------------------------
    # Clean up
    # -------------------------------------------
    del refds
    del devds
    gc.collect()



[docs]
def make_benchmark_collection_2d_var_plots(
        ref,
        refstr,
        dev,
        devstr,
        colname,
        spcdb_files,
        var_prefix=None,
        varlist=None,
        dst="./benchmark",
        cmpres=None,
        overwrite=False,
        verbose=False,
        flip_ref=False,
        flip_dev=False,
        log_color_scale=False,
        weightsdir='.',
        n_job=-1,
        time_mean=False,
):
    """
    Creates PDF file containing plots comparing all 2D variables in
    a collection without special handling.

    Parameters
    ----------
    ref : str
        Path name for the "Ref" (aka "Reference") data set.
    refstr : str
        A string to describe ref (e.g. version number).
    dev : str
        Path name for the "Dev" (aka "Development") data set.
        This data set will be compared against the "Reference"
        data set.
    devstr : str
        A string to describe dev (e.g. version number).
    colname : str
        Name of file collection, to be used in PDF name.
    spcdb_files : list
        Paths to species_database.yml files in Ref & Dev rundirs.
    var_prefix : str, optional
        Variable name prefix in file to search for. If excluded then all 2D
        variables will be plotted regardless of name.
        Default value: None
    varlist : list of str, optional
        List of variables to plot.  If not passed,
        then all variables common to both dev
        and ref will be plotted.  The varlist argument can be
        a useful way of restricting the number of variables
        plotted to the pdf file when debugging.
        Default value: None
    dst : str, optional
        A string denoting the destination folder where a
        PDF file containing plots will be written.
        Default value: ./benchmark.
    cmpres : str, optional
        Grid resolution at which to compare ref and dev data, e.g. '1x1.25'.
    overwrite : bool, optional
        Set this flag to True to overwrite files in the
        destination folder (specified by the dst argument).
        Default value: False.
    verbose : bool, optional
        Set this flag to True to print extra informational output.
        Default value: False
    flip_ref : bool, optional
        Set this flag to True to reverse the vertical level
        ordering in the "Ref" dataset (in case "Ref" starts
        from the top of atmosphere instead of the surface).
        Default value: False
    flip_dev : bool, optional
        Set this flag to True to reverse the vertical level
        ordering in the "Dev" dataset (in case "Dev" starts
        from the top of atmosphere instead of the surface).
        Default value: False
    log_color_scale : bool, optional
        Set this flag to True if you wish to enable plotting data
        (not diffs) on a log color scale.
        Default value: False
    weightsdir : str, optional
        Directory in which to place (and possibly reuse) xESMF regridder
        netCDF files.
        Default value: '.'
    n_job : int, optional
        Defines the number of simultaneous workers for parallel plotting.
        Set to 1 to disable parallel plotting. Value of -1 allows the
        application to decide.
        Default value: -1
    time_mean : bool, optional
        Determines if we should average the datasets over time.
        Default value: False

    Notes
    -----
    Will create 4 files containing the following plots:

    1. Surface values
    2. 500 hPa values
    3. Full-column zonal mean values.
    4. Stratospheric zonal mean values

    These can be toggled on/off with the plots keyword argument.
    """

    # ==================================================================
    # Initialization
    # ==================================================================

    # Create the directory for output
    make_directory(dst, overwrite)

    # Replace whitespace in the ref and dev labels
    refstr = replace_whitespace(refstr)
    devstr = replace_whitespace(devstr)

    # Get the function that will read file(s) into a Dataset
    reader = dataset_reader(time_mean, verbose=verbose)

    # Ref dataset
    try:
        refds = reader(ref, drop_variables=SKIP_THESE_VARS)
    except (OSError, IOError, FileNotFoundError) as e:
        raise e(f"Could not find Ref file: {ref}") from e

    # Dev dataset
    try:
        devds = reader(dev, drop_variables=SKIP_THESE_VARS)
    except (OSError, IOError, FileNotFoundError) as e:
        raise e(f"Could not find Ref file: {dev}") from e

    # Compute mean of data over the time dimension (if time_mean=True)
    if time_mean:
        refds = dataset_mean(refds)
        devds = dataset_mean(devds)

    # Make sure that Ref and Dev datasets have the same variables.
    # Variables that are in Ref but not in Dev will be added to Dev
    # with all missing values (NaNs). And vice-versa.
    [refds, devds] = add_missing_variables(refds, devds)

    # Create regridding files if necessary
    [_ for _ in create_regridders(refds, devds, weightsdir=weightsdir)]

    # Get a list of the 2D variables in both datasets
    if varlist is None:
        vardict = compare_varnames(refds, devds, quiet=not verbose)
        cmn = vardict["commonvars2D"]

    # Get a list of variables
    # (or use the varlist passed via tha argument list)
    if varlist is None:
        if var_prefix is None:
            varlist = cmn
        else:
            varlist = [v for v in cmn if var_prefix in v]

    # ==================================================================
    # Create the plots
    # ==================================================================

    # Make the output folder if it doesn't exist
    bdir = os.path.join(dst, colname)
    if not os.path.isdir(bdir):
        os.mkdir(bdir)
    extra_title_txt = None
    pdfname = os.path.join(
        bdir,
        f"{colname}_2D.pdf"
    )

    compare_single_level(
        refds,
        refstr,
        devds,
        devstr,
        varlist=varlist,
        ilev=0,
        pdfname=pdfname,
        flip_ref=flip_ref,
        flip_dev=flip_dev,
        log_color_scale=log_color_scale,
        extra_title_txt=extra_title_txt,
        weightsdir=weightsdir,
        n_job=n_job,
        spcdb_files=spcdb_files,
    )
    add_bookmarks_to_pdf(
        pdfname,
        varlist,
        verbose=verbose
    )

    # -------------------------------------------
    # Clean up
    # -------------------------------------------
    del refds
    del devds
    gc.collect()




[docs]
def make_benchmark_collection_3d_var_plots(
        ref,
        refstr,
        dev,
        devstr,
        colname,
        spcdb_files,
        var_prefix=None,
        varlist=None,
        dst="./benchmark",
        cmpres=None,
        plots=["sfc", "500hpa", "zonalmean"],
        overwrite=False,
        verbose=False,
        flip_ref=False,
        flip_dev=False,
        log_color_scale=False,
        weightsdir='.',
        n_job=-1,
        time_mean=False,
):
    """
    Creates PDF files containing plots comparing all 3D variables in
    a collection without special handling.

    Parameters
    ----------
    ref : str
        Path name for the "Ref" (aka "Reference") data set.
    refstr : str
        A string to describe ref (e.g. version number).
    dev : str
        Path name for the "Dev" (aka "Development") data set.
        This data set will be compared against the "Reference"
        data set.
    devstr : str
        A string to describe dev (e.g. version number).
    colname : str
        Name of file collection, to be used in PDF name.
    spcdb_files : list
        Paths to species_database.yml files in Ref & Dev rundirs.
    var_prefix : str, optional
        Variable name prefix in file to search for. If excluded then all 2D
        variables will be plotted regardless of name.
        Default value: None
    varlist : list of str, optional
        List of J-value variables to plot.  If not passed,
        then all J-value variables common to both dev
        and ref will be plotted.  The varlist argument can be
        a useful way of restricting the number of variables
        plotted to the pdf file when debugging.
        Default value: None
    dst : str, optional
        A string denoting the destination folder where a
        PDF file containing plots will be written.
        Default value: ./benchmark.
    cmpres : str, optional
        Grid resolution at which to compare ref and dev data, e.g. '1x1.25'.
    plots : list of str, optional
        List of plot types to create.
        Default value: ['sfc', '500hpa', 'zonalmean']
    overwrite : bool, optional
        Set this flag to True to overwrite files in the
        destination folder (specified by the dst argument).
        Default value: False.
    verbose : bool, optional
        Set this flag to True to print extra informational output.
        Default value: False
    flip_ref : bool, optional
        Set this flag to True to reverse the vertical level
        ordering in the "Ref" dataset (in case "Ref" starts
        from the top of atmosphere instead of the surface).
        Default value: False
    flip_dev : bool, optional
        Set this flag to True to reverse the vertical level
        ordering in the "Dev" dataset (in case "Dev" starts
        from the top of atmosphere instead of the surface).
        Default value: False
    log_color_scale : bool, optional
        Set this flag to True if you wish to enable plotting data
        (not diffs) on a log color scale.
        Default value: False
    weightsdir : str, optional
        Directory in which to place (and possibly reuse) xESMF regridder
        netCDF files.
        Default value: '.'
    n_job : int, optional
        Defines the number of simultaneous workers for parallel plotting.
        Set to 1 to disable parallel plotting. Value of -1 allows the
        application to decide.
        Default value: -1
    time_mean : bool, optional
        Determines if we should average the datasets over time.
        Default value: False

    Notes
    -----
    Will create 4 files containing the following plots:

    1. Surface values
    2. 500 hPa values
    3. Full-column zonal mean values.
    4. Stratospheric zonal mean values

    These can be toggled on/off with the plots keyword argument.
    """

    # ==================================================================
    # Initialization
    # ==================================================================

    # Create the directory for output
    make_directory(dst, overwrite)

    # Replace whitespace in the ref and dev labels
    refstr = replace_whitespace(refstr)
    devstr = replace_whitespace(devstr)

    # Get the function that will read file(s) into a Dataset
    reader = dataset_reader(time_mean, verbose=verbose)

    # Ref dataset
    try:
        refds = reader(ref, drop_variables=SKIP_THESE_VARS)
    except (OSError, IOError, FileNotFoundError) as e:
        raise e(f"Could not find Ref file: {ref}") from e

    # Dev dataset
    try:
        devds = reader(dev, drop_variables=SKIP_THESE_VARS)
    except (OSError, IOError, FileNotFoundError) as e:
        raise e(f"Could not find Ref file: {dev}") from e

    # Compute mean of data over the time dimension (if time_mean=True)
    if time_mean:
        refds = dataset_mean(refds)
        devds = dataset_mean(devds)

    # Make sure that Ref and Dev datasets have the same variables.
    # Variables that are in Ref but not in Dev will be added to Dev
    # with all missing values (NaNs). And vice-versa.
    [refds, devds] = add_missing_variables(refds, devds)

    # Create regridding files if necessary
    [_ for _ in create_regridders(refds, devds, weightsdir=weightsdir)]

    # Get a list of the 3D variables in both datasets
    if varlist is None:
        vardict = compare_varnames(refds, devds, quiet=not verbose)
        cmn = vardict["commonvars3D"]

    # Get a list of variables
    # (or use the varlist passed via tha argument list)
    if varlist is None:
        if var_prefix is None:
            varlist = cmn
        else:
            varlist = [v for v in cmn if var_prefix in v]

    # ==================================================================
    # Create the plots
    # ==================================================================

    # Make the output folder if it doesn't exist
    bdir = os.path.join(dst, colname)
    if not os.path.isdir(bdir):
        os.mkdir(bdir)
    extra_title_txt = None
    pdfname = os.path.join(
        bdir,
        f"{colname}.pdf"
    )

    # Surface plots
    if "sfc" in plots:
        pdfname = os.path.join(
            bdir,
            f"{colname}_Surface.pdf"
        )

        compare_single_level(
            refds,
            refstr,
            devds,
            devstr,
            varlist=varlist,
            ilev=0,
            pdfname=pdfname,
            flip_ref=flip_ref,
            flip_dev=flip_dev,
            log_color_scale=log_color_scale,
            extra_title_txt=extra_title_txt,
            weightsdir=weightsdir,
            n_job=n_job,
            spcdb_files=spcdb_files
        )
        add_bookmarks_to_pdf(
            pdfname,
            varlist,
            verbose=verbose
        )

    # 500hPa plots
    if "500hpa" in plots:
        pdfname = os.path.join(
            bdir,
            f"{colname}_500hPa.pdf"
        )

        compare_single_level(
            refds,
            refstr,
            devds,
            devstr,
            varlist=varlist,
            cmpres=cmpres,
            ilev=22,
            pdfname=pdfname,
            flip_ref=flip_ref,
            flip_dev=flip_dev,
            log_color_scale=log_color_scale,
            extra_title_txt=extra_title_txt,
            weightsdir=weightsdir,
            n_job=n_job,
            spcdb_files=spcdb_files,
        )
        add_bookmarks_to_pdf(
            pdfname,
            varlist,
            verbose=verbose
        )
    # Full-column zonal mean plots
    if "zonalmean" in plots:
        pdfname = os.path.join(
            bdir,
            f"{colname}_ZonalMean.pdf"
        )

        compare_zonal_mean(
            refds,
            refstr,
            devds,
            devstr,
            varlist=varlist,
            pdfname=pdfname,
            flip_ref=flip_ref,
            flip_dev=flip_dev,
            log_color_scale=log_color_scale,
            extra_title_txt=extra_title_txt,
            weightsdir=weightsdir,
            n_job=n_job,
            spcdb_files=spcdb_files
        )
        add_bookmarks_to_pdf(
            pdfname,
            varlist,
            verbose=verbose
        )

        # Strat_ZonalMean plots will use a log-pressure Y-axis, with
        # a range of 1..100 hPa, as per GCSC request. (bmy, 8/13/19)
        pdfname = os.path.join(
            bdir,
            f"{colname}_Strat_ZonalMean.pdf"
        )

        compare_zonal_mean(
            refds,
            refstr,
            devds,
            devstr,
            varlist=varlist,
            pdfname=pdfname,
            pres_range=[1, 100],
            log_yaxis=True,
            flip_ref=flip_ref,
            flip_dev=flip_dev,
            extra_title_txt=extra_title_txt,
            log_color_scale=log_color_scale,
            weightsdir=weightsdir,
            n_job=n_job,
            spcdb_files=spcdb_files
        )
        add_bookmarks_to_pdf(
            pdfname,
            varlist,
            verbose=verbose
        )

    # -------------------------------------------
    # Clean up
    # -------------------------------------------
    del refds
    del devds
    gc.collect()




[docs]
def make_benchmark_aod_plots(
        ref,
        refstr,
        dev,
        devstr,
        spcdb_files,
        varlist=None,
        dst="./benchmark",
        subdst=None,
        cmpres=None,
        overwrite=False,
        verbose=False,
        log_color_scale=False,
        sigdiff_files=None,
        weightsdir='.',
        n_job=-1,
        time_mean=False,
):
    """
    Creates PDF files containing plots of column aerosol optical
    depths (AODs) for model benchmarking purposes.

    Parameters
    ----------
    ref : str
        Path name for the "Ref" (aka "Reference") data set.
    refstr : str
        A string to describe ref (e.g. version number).
    dev : str
        Path name for the "Dev" (aka "Development") data set.
        This data set will be compared against the "Reference"
        data set.
    devstr : str
        A string to describe dev (e.g. version number).
    spcdb_files : list
        Paths to species_database.yml files in Ref & Dev rundirs.
    varlist : list of str, optional
        List of AOD variables to plot.  If not passed, then all
        AOD variables common to both Dev and Ref will be plotted.
        Use the varlist argument to restrict the number of
        variables plotted to the pdf file when debugging.
        Default value: None
    dst : str, optional
        A string denoting the destination folder where a
        PDF file containing plots will be written.
        Default value: ./benchmark.
    subdst : str, optional
        A string denoting the sub-directory of dst where PDF
        files containing plots will be written.  In practice,
        subdst is only needed for the 1-year benchmark output,
        and denotes a date string (such as "Jan2016") that
        corresponds to the month that is being plotted.
        Default value: None
    cmpres : str, optional
        Grid resolution at which to compare ref and dev data, e.g. '1x1.25'.
    overwrite : bool, optional
        Set this flag to True to overwrite files in the
        destination folder (specified by the dst argument).
        Default value: False.
    verbose : bool, optional
        Set this flag to True to print extra informational output.
        Default value: False
    log_color_scale : bool, optional
        Set this flag to True to enable plotting data (not diffs)
        on a log color scale.
        Default value: False
    sigdiff_files : list of str, optional
        Filenames that will contain the list of quantities having
        having significant differences in the column AOD plots.
        These lists are needed in order to fill out the benchmark
        approval forms.
        Default value: None
    weightsdir : str, optional
        Directory in which to place (and possibly reuse) xESMF regridder
        netCDF files.
        Default value: '.'
    n_job : int, optional
        Defines the number of simultaneous workers for parallel plotting.
        Set to 1 to disable parallel plotting. Value of -1 allows the
        application to decide.
        Default value: -1
    time_mean : bool, optional
        Determines if we should average the datasets over time.
        Default value: False
    """
    # ==================================================================
    # Initialization and also read data
    # ==================================================================

    # Create destination plots directory
    make_directory(dst, overwrite)

    # Create the "Aerosols" directory as a subfolder of dst.
    # If subdst is passed, then create a subdirectory of the "Aerosols"
    # directory (e.g. which is needed for the 1-year benchmarks).
    aoddir = os.path.join(dst, "Aerosols")
    if not os.path.isdir(aoddir):
        os.mkdir(aoddir)
    if subdst is not None:
        aoddir = os.path.join(aoddir, subdst)
        if not os.path.isdir(aoddir):
            os.mkdir(aoddir)
        extra_title_txt = subdst
    else:
        extra_title_txt = None

    # Replace whitespace in the ref and dev labels
    refstr = replace_whitespace(refstr)
    devstr = replace_whitespace(devstr)

    # Get the function that will read file(s) into a dataset
    reader = dataset_reader(time_mean, verbose=verbose)

    # Read the Ref dataset
    try:
        refds = reader(ref, drop_variables=SKIP_THESE_VARS)
    except (OSError, IOError, FileNotFoundError) as e:
        raise e(f"Could not find Ref file: {ref}") from e

    # Read the Dev dataset
    try:
        devds = reader(dev, drop_variables=SKIP_THESE_VARS)
    except (OSError, IOError, FileNotFoundError) as e:
        raise e(f"Could not find Ref file: {dev}") from e

    # Compute mean of data over the time dimension (if time_mean=True)
    if time_mean:
        refds = dataset_mean(refds)
        devds = dataset_mean(devds)

    # Create regridding files if necessary
    [_ for _ in create_regridders(refds, devds, weightsdir=weightsdir)]

    # NOTE: GCHP diagnostic variable exports are defined before the
    # input.geos file is read.  This means "WL1" will not have been
    # replaced with "550nm" in the variable names.  Do this string
    # replace operation here, so that we can compare GCC and GCHP
    # data directly. (bmy, 4/29/19)
    with xr.set_options(keep_attrs=True):

        # Rename variables in the Ref dataset
        old2new = {}
        for v in refds.data_vars.keys():
            if "WL1" in v:
                newname = v.replace("WL1", "550nm")
                old2new[v] = newname
        refds = refds.rename(old2new)

        # Rename variables in the Dev dataset
        old2new = {}
        for v in devds.data_vars.keys():
            if "WL1" in v:
                newname = v.replace("WL1", "550nm")
                old2new[v] = newname
        devds = devds.rename(old2new)

    # Make sure that Ref and Dev datasets have the same variables.
    # Variables that are in Ref but not in Dev will be added to Dev
    # with all missing values (NaNs). And vice-versa.
    [refds, devds] = add_missing_variables(refds, devds)

    # Find common AOD variables in both datasets
    # (or use the varlist passed via keyword argument)
    if varlist is None:
        vardict = compare_varnames(refds, devds, quiet=not verbose)
        cmn3D = vardict["commonvars3D"]
        varlist = [v for v in cmn3D if "AOD" in v]

    # Dictionary and list for new display names
    newvars = read_config_file(
        os.path.join(
            os.path.dirname(__file__),
            AOD_SPC
        ),
        quiet=True
    )
    newvarlist = []

    # ==================================================================
    # Compute the total AOD by summing over the constituent members
    # ==================================================================

    # Take one of the variables so we can use its dims, coords,
    # attrs to create the DataArray object for total AOD
    v = varlist[0]

    # Create a DataArray to hold total column AOD
    # This is the same shape as the DataArray objects in refds
    reftot = xr.DataArray(
        np.zeros(refds[v].values.shape),
        name="AODTotal",
        dims=refds[v].dims,
        coords=refds[v].coords,
        attrs=refds[v].attrs,
    )

    # Create a DataArray to hold total column AOD
    # This is the same shape as the DataArray objects in devds
    devtot = xr.DataArray(
        np.zeros(devds[v].values.shape),
        name="AODTotal",
        dims=devds[v].dims,
        coords=devds[v].coords,
        attrs=devds[v].attrs,
    )

    # Save the variable attributes so that we can reattach them
    refattrs = reftot.attrs
    devattrs = devtot.attrs

    # Compute the sum of all AOD variables
    # Avoid double-counting the following:
    # (1) Individual dust AODs, use the column AOD total instead
    # (2) SOA from aqueous isoprene, which is already accounted
    #     for in AODHyg550nm_OCPI.  Also see Github issue:
    #     https://github.com/geoschem/gcpy/issues/65
    for v in varlist:
        if "_bin" in v or "AODSOAfromAqIsoprene550nm" in v:
            continue
        reftot = reftot + refds[v]
        devtot = devtot + devds[v]

    # Reattach the variable attributes
    reftot.name = "AODTotal"
    reftot.attrs = refattrs
    reftot.attrs["long_name"] = "Total aerosol optical depth"
    devtot.name = "AODTotal"
    devtot.attrs = devattrs
    devtot.attrs["long_name"] = "Total aerosol optical depth"

    # Merge these variables back into the dataset
    refds = xr.merge([refds, reftot])
    devds = xr.merge([devds, devtot])

    # Also add AODTotal to the list
    varlist.append("AODTotal")

    # ==================================================================
    # Compute column AODs
    # Create a new DataArray for each column AOD variable,
    # using the new display name, and preserving attributes.
    # Merge the new DataArrays back into the DataSets.
    # ==================================================================
    for v in varlist:

        # Get the new name for each AOD variable (it's easier to display)
        if v in newvars:
            newname = newvars[v]
            newvarlist.append(newname)
        else:
            raise ValueError(f"Could not find a display name for {v}")

        # Don't clobber existing DataArray and Dataset attributes
        with xr.set_options(keep_attrs=True):

            # Add column AOD of newname to Ref
            array = refds[v].sum(dim="lev")
            array.name = newname
            array.attrs["units"] = "1"
            refds = xr.merge([refds, array])

            # Add column AOD of newname to Dev
            array = devds[v].sum(dim="lev")
            array.name = newname
            array.attrs["units"] = "1"
            devds = xr.merge([devds, array])

    # ==================================================================
    # Create the plots
    # ==================================================================
    if subdst is not None:
        pdfname = os.path.join(
            aoddir,
            f"Aerosols_ColumnOptDepth_{subdst}.pdf"
        )
    else:
        pdfname = os.path.join(
            aoddir,
            "Aerosols_ColumnOptDepth.pdf"
        )

    diff_aod = []
    compare_single_level(
        refds,
        refstr,
        devds,
        devstr,
        varlist=newvarlist,
        cmpres=cmpres,
        ilev=0,
        pdfname=pdfname,
        log_color_scale=log_color_scale,
        extra_title_txt=extra_title_txt,
        sigdiff_list=diff_aod,
        weightsdir=weightsdir,
        n_job=n_job,
        spcdb_files=spcdb_files
    )
    diff_aod[:] = [v.replace("Column_AOD_", "") for v in diff_aod]
    add_bookmarks_to_pdf(
        pdfname,
        newvarlist,
        remove_prefix="Column_AOD_",
        verbose=verbose
    )

    # ==================================================================
    # Write the list of AOD quantities having significant differences,
    # which we will need to fill out the benchmark forms.
    # ==================================================================
    if sigdiff_files is not None:
        for filename in sigdiff_files:
            if "sfc" in filename:
                with open(filename, "a+", encoding=ENCODING) as f:
                    print("* Column AOD: ", file=f, end="")
                    for v in diff_aod:
                        print(f"{v} ", file=f, end="")
                    print(file=f)
                    f.close()

    # -------------------------------------------
    # Clean up
    # -------------------------------------------
    del refds
    del devds
    gc.collect()




[docs]
def make_benchmark_mass_tables(
        ref,
        refstr,
        dev,
        devstr,
        spcdb_files,
        varlist=None,
        dst="./benchmark",
        subdst=None,
        overwrite=False,
        verbose=False,
        ref_hdr_label="",
        dev_hdr_label="",
        ref_met_extra=None,
        dev_met_extra=None,
):
    """
    Creates a text file containing global mass totals by species and
    category for benchmarking purposes.

    Parameters
    ----------
    ref : str
        Pathname that will constitute
        the "Ref" (aka "Reference") data set.
    refstr : str
        A string to describe ref (e.g. version number).
    dev : str
        Pathname that will constitute
        the "Dev" (aka "Development") data set.  The "Dev"
        data set will be compared against the "Ref" data set.
    devstr : str
        A string to describe dev (e.g. version number).
    spcdb_files : list
        Paths to species_database.yml files in Ref & Dev rundirs.
    varlist : list of str, optional
        List of variables to include in the list of totals.
        If omitted, then all variables that are found in either
        "Ref" or "Dev" will be included.  The varlist argument
        can be a useful way of reducing the number of
        variables during debugging and testing.
        Default value: None
    dst : str, optional
        A string denoting the destination folder where the file
        containing emissions totals will be written.
        Default value: ./benchmark
    subdst : str, optional
        A string denoting the sub-directory of dst where PDF
        files containing plots will be written.  In practice,
        subdst is only needed for the 1-year benchmark output,
        and denotes a date string (such as "Jan2016") that
        corresponds to the month that is being plotted.
        Default value: None
    overwrite : bool, optional
        Set this flag to True to overwrite files in the
        destination folder (specified by the dst argument).
        Default value: False
    verbose : bool, optional
        Set this flag to True to print extra informational output.
        Default value: False.
    ref_hdr_label : str, optional
        Label for Ref, placed after refstr in the file header.
        Default value: ""
    dev_hdr_label : str, optional
        Label for Dev, placed after devstr in the file header.
        Default value: ""
    ref_met_extra : str, optional
        Path to ref Met file containing area data for use with restart files
        which do not contain the Area variable.
        Default value: ''
    dev_met_extra : str, optional
        Path to dev Met file containing area data for use with restart files
        which do not contain the Area variable.
        Default value: ''
    """
    # ==================================================================
    # Initialization
    # ==================================================================

    # Replace whitespace in the ref and dev labels
    refstr = replace_whitespace(refstr)
    devstr = replace_whitespace(devstr)

    # ==================================================================
    # Define destination directory
    # ==================================================================
    if os.path.isdir(dst) and not overwrite:
        msg = "Directory {} exists. Pass overwrite=True to overwrite " \
            + "files in that directory, if any."
        msg = msg.format(dst)
        raise ValueError(msg)
    if not os.path.isdir(dst):
        try:
            os.makedirs(dst)
        except FileExistsError:
            pass

    # ==================================================================
    # Read data from netCDF into Dataset objects
    # ==================================================================

    # Read data
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=xr.SerializationWarning)
        refds = xr.open_dataset(ref, drop_variables=SKIP_THESE_VARS)
        devds = xr.open_dataset(dev, drop_variables=SKIP_THESE_VARS)

    # ==================================================================
    # Update GCHP restart dataset (if any)
    # ==================================================================

    # If the data is from a GCHP restart file, rename variables and
    # flip levels to match the GEOS-Chem Classic naming and level
    # conventions.  Otherwise no changes will be made.
    refds = rename_and_flip_gchp_rst_vars(refds)
    devds = rename_and_flip_gchp_rst_vars(devds)

    # ==================================================================
    # Make sure that all necessary meteorological variables are found
    # ==================================================================
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=xr.SerializationWarning)

        # Find the area variable in Ref
        if ref_met_extra is None:
            ref_area = get_area_from_dataset(refds)
        else:
            ref_area = get_area_from_dataset(
                xr.open_dataset(
                    ref_met_extra,
                    drop_variables=SKIP_THESE_VARS
                )
            )

        # Find the area variable in Dev
        if dev_met_extra is None:
            dev_area = get_area_from_dataset(devds)
        else:
            dev_area = get_area_from_dataset(
                xr.open_dataset(
                    dev_met_extra,
                    drop_variables=SKIP_THESE_VARS
                )
            )

    # Find required meteorological variables in Ref
    # (or exit with an error if we can't find them)
    metvar_list = ["Met_DELPDRY", "Met_BXHEIGHT", "Met_TropLev"]
    refmet = get_variables_from_dataset(refds, metvar_list)
    devmet = get_variables_from_dataset(devds, metvar_list)

    # ==================================================================
    # Make sure that all necessary species are found
    # ==================================================================

    # Get lists of variables names in datasets
    vardict = compare_varnames(refds, devds, quiet=not verbose)
    commonvars = vardict["commonvars3D"]
    refonly = vardict['refonly']
    devonly = vardict['devonly']

    # Narrow down the lists to only include species
    commonspc = [v for v in commonvars if "SpeciesRst_" in v]
    refonlyspc = [v for v in refonly if v.startswith('SpeciesRst_')]
    devonlyspc = [v for v in devonly if v.startswith('SpeciesRst_')]

    # Add ref only species to dev dataset with all nan values
    if refonlyspc:
        for v in refonlyspc:
            devds[v] = devds[commonspc[0]]
            devds[v].data = np.full(devds[v].shape, np.nan)
            devds[v].attrs['units'] = refds[v].units
            commonspc.append(v)

    # Add dev only species to ref dataset with all nan values
    if devonlyspc:
        for v in devonlyspc:
            refds[v] = refds[commonspc[0]]
            refds[v].data = np.full(refds[v].shape, np.nan)
            devds[v].attrs['units'] = refds[v].units
            commonspc.append(v)

    # Set list of variables to print in mass table. If this list was passed
    # as argument, check that all the vars are now in commonspc to ensure
    # in both datasets.
    if varlist:
        for v in varlist:
            if v not in commonspc:
                raise ValueError(
                    f"{dst} folder error: Variable {v} in varlist passed to make_benchmark_mass_tables is not present in Ref and Dev datasets"
                )
    else:
        varlist = commonspc

    # Sort the list of species to be printed alphabetically
    varlist.sort()

    # ==================================================================
    # Create the mask arrays for the troposphere for Ref and Dev
    # ==================================================================
    ref_tropmask = get_troposphere_mask(refmet)
    dev_tropmask = get_troposphere_mask(devmet)

    # ==================================================================
    # Create a dictionary to hold all of the meterological
    # variables and mask variables that we need to pass down
    # ==================================================================
    met_and_masks = {
        "Ref_Area": ref_area,
        "Dev_Area": dev_area,
        "Ref_Delta_P": refmet["Met_DELPDRY"],
        "Dev_Delta_P": devmet["Met_DELPDRY"],
        "Ref_BxHeight": refmet["Met_BXHEIGHT"],
        "Dev_BxHeight": devmet["Met_BXHEIGHT"],
        "Ref_TropMask": ref_tropmask,
        "Dev_TropMask": dev_tropmask,
    }

    # ==================================================================
    # Create global mass table
    # ==================================================================
    if subdst is not None:
        mass_filename = f"GlobalMass_TropStrat_{subdst}.txt"
    else:
        mass_filename = "GlobalMass_TropStrat.txt"
    mass_file = os.path.join(dst, mass_filename)
    create_global_mass_table(
        refds,
        refstr,
        devds,
        devstr,
        varlist,
        met_and_masks,
        spcdb_files,
        ref_hdr_label=ref_hdr_label,
        dev_hdr_label=dev_hdr_label,
        outfilename=mass_file,
        verbose=verbose,
    )

    # ==================================================================
    # Create tropospheric mass table
    # ==================================================================

    # If a file name has not been specified, then use the "filename"
    # keyword argument.  Otherwise generate a default filename.
    if subdst is not None:
        mass_filename = f"GlobalMass_Trop_{subdst}.txt"
    else:
        mass_filename = 'GlobalMass_Trop.txt'
    mass_file = os.path.join(dst, mass_filename)
    create_global_mass_table(
        refds,
        refstr,
        devds,
        devstr,
        varlist,
        met_and_masks,
        spcdb_files,
        ref_hdr_label=ref_hdr_label,
        dev_hdr_label=dev_hdr_label,
        outfilename=mass_file,
        trop_only=True,
        verbose=verbose,
    )

    # -------------------------------------------
    # Clean up
    # -------------------------------------------
    del refds
    del devds
    gc.collect()




[docs]
def make_benchmark_mass_accumulation_tables(
        ref_start,
        ref_end,
        refstr,
        refperiodstr,
        dev_start,
        dev_end,
        devstr,
        devperiodstr,
        spcdb_files,
        varlist=None,
        dst="./benchmark",
        subdst=None,
        overwrite=False,
        verbose=False,
        label="at end of simulation",
):
    """
    Creates a text file containing global mass totals by species and
    category for benchmarking purposes.

    Parameters
    ----------
    ref_start : list of str
        Pathname that will constitute
        the "Ref" (aka "Reference") data set.
    ref_end : list of str
        Pathname that will constitute
        the "Ref" (aka "Reference") data set.
    refstr : str
        A string to describe ref (e.g. version number).
    refperiodstr : str
        Ref simulation period start and end.
    dev_start : list of str
        Pathname that will constitute
        the "Dev" (aka "Development") data set.  The "Dev"
        data set will be compared against the "Ref" data set.
    dev_end : list of str
        Pathname that will constitute
        the "Dev" (aka "Development") data set.  The "Dev"
        data set will be compared against the "Ref" data set.
    devstr : str
        A string to describe dev (e.g. version number).
    devperiodstr : str
        Dev simulation period start and end.
    spcdb_files : list
        Paths to species_database.yml files in Ref & Dev rundirs.
    varlist : list of str, optional
        List of variables to include in the list of totals.
        If omitted, then all variables that are found in either
        "Ref" or "Dev" will be included.  The varlist argument
        can be a useful way of reducing the number of
        variables during debugging and testing.
        Default value: None
    dst : str, optional
        A string denoting the destination folder where the file
        containing emissions totals will be written.
        Default value: ./benchmark
    subdst : str, optional
        A string denoting the sub-directory of dst where PDF
        files containing plots will be written.  In practice,
        subdst is only needed for the 1-year benchmark output,
        and denotes a date string (such as "Jan2016") that
        corresponds to the month that is being plotted.
        Default value: None
    overwrite : bool, optional
        Set this flag to True to overwrite files in the
        destination folder (specified by the dst argument).
        Default value: False
    verbose : bool, optional
        Set this flag to True to print extra informational output.
        Default value: False.
    """
    # ==================================================================
    # Initialization
    # ==================================================================

    # Replace whitespace in the ref and dev labels
    refstr = replace_whitespace(refstr)
    devstr = replace_whitespace(devstr)

    # ==================================================================
    # Define destination directory
    # ==================================================================
    if os.path.isdir(dst) and not overwrite:
        msg = "Directory {} exists. Pass overwrite=True to overwrite " \
            + "files in that directory, if any."
        msg = msg.format(dst)
        raise ValueError(msg)
    if not os.path.isdir(dst):
        try:
            os.makedirs(dst)
        except FileExistsError:
            pass

    # ==================================================================
    # Read data from netCDF into Dataset objects
    # ==================================================================

    print('Creating mass accumulation tables from four restart files:')
    print(f'   Ref start: {ref_start}')
    print(f'   Ref end:   {ref_end}')
    print(f'   Dev start: {dev_start}')
    print(f'   Dev end:   {dev_end}')

    # Read data
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=xr.SerializationWarning)
        refSds = xr.open_dataset(ref_start, drop_variables=SKIP_THESE_VARS)
        refEds = xr.open_dataset(ref_end, drop_variables=SKIP_THESE_VARS)
        devSds = xr.open_dataset(dev_start, drop_variables=SKIP_THESE_VARS)
        devEds = xr.open_dataset(dev_end, drop_variables=SKIP_THESE_VARS)

    # ==================================================================
    # Update GCHP restart dataset if needed
    # ==================================================================

    # If the data is from a GCHP restart file, rename variables and
    # flip levels to match the GEOS-Chem Classic naming and level
    # conventions.  Otherwise no changes will be made.
    refSds = rename_and_flip_gchp_rst_vars(refSds)
    refEds = rename_and_flip_gchp_rst_vars(refEds)
    devSds = rename_and_flip_gchp_rst_vars(devSds)
    devEds = rename_and_flip_gchp_rst_vars(devEds)

    # Add area to start restart dataset if area in end but not start
    # Need to consider area variable names used in both GC-Classic and GCHP
    # Should put this in a function (todo)
    refSkeys = refSds.data_vars.keys()
    refEkeys = refEds.data_vars.keys()
    devSkeys = devSds.data_vars.keys()
    devEkeys = devEds.data_vars.keys()
    areaVars = ["Met_AREAM2", "AREA"]
    for areaVar in areaVars:
        if areaVar in refEkeys and areaVar not in refSkeys:
            refSds[areaVar] = refEds[areaVar]
        if areaVar in devEkeys and areaVar not in devSkeys:
            devSds[areaVar] = devEds[areaVar]

    # ==================================================================
    # Make sure that all necessary meteorological variables are found
    # ==================================================================
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=xr.SerializationWarning)

        # Find the area variable in Ref
        refs_area = get_area_from_dataset(refSds)
        refe_area = get_area_from_dataset(refEds)

        # Find the area variable in Dev
        devs_area = get_area_from_dataset(devSds)
        deve_area = get_area_from_dataset(devEds)

    # Find required meteorological variables in Ref
    # (or exit with an error if we can't find them)
    metvar_list = ["Met_DELPDRY", "Met_BXHEIGHT", "Met_TropLev"]
    refsmet = get_variables_from_dataset(refSds, metvar_list)
    refemet = get_variables_from_dataset(refEds, metvar_list)
    devsmet = get_variables_from_dataset(devSds, metvar_list)
    devemet = get_variables_from_dataset(devEds, metvar_list)

    # ==================================================================
    # Make sure that all necessary species are found
    # ==================================================================

    # Get lists of variables names in datasets
    vardict = compare_varnames(refSds, devSds, quiet=not verbose)
    commonvars = vardict["commonvars3D"]
    refonly = vardict['refonly']
    devonly = vardict['devonly']

    # Narrow down the lists to only include species
    commonspc = [v for v in commonvars if "SpeciesRst_" in v]
    refonlyspc = [v for v in refonly if v.startswith('SpeciesRst_')]
    devonlyspc = [v for v in devonly if v.startswith('SpeciesRst_')]

    # Add ref only species to dev dataset with all nan values
    if refonlyspc:
        for v in refonlyspc:
            devSds[v] = devSds[commonspc[0]]
            devSds[v].data = np.full(devSds[v].shape, np.nan)
            devSds[v].attrs['units'] = refSds[v].units
            devEds[v] = devEds[commonspc[0]]
            devEds[v].data = np.full(devEds[v].shape, np.nan)
            devEds[v].attrs['units'] = refEds[v].units
            commonspc.append(v)

    # Add dev only species to ref dataset with all nan values
    if devonlyspc:
        for v in devonlyspc:
            refSds[v] = refSds[commonspc[0]]
            refSds[v].data = np.full(refSds[v].shape, np.nan)
            devSds[v].attrs['units'] = refSds[v].units
            refEds[v] = refEds[commonspc[0]]
            refEds[v].data = np.full(refEds[v].shape, np.nan)
            devEds[v].attrs['units'] = refEds[v].units
            commonspc.append(v)

    # Set list of variables to print in mass table. If this list was passed
    # as argument, check that all the vars are now in commonspc to ensure
    # in both datasets.
    if varlist:
        for v in varlist:
            if v not in commonspc:
                raise ValueError(
                    f"{dst} folder error: Variable {v} in varlist passed to make_benchmark_mass_tables is not present in Ref and Dev datasets"
                )
    else:
        varlist = commonspc

    # Sort the list of species to be printed alphabetically
    varlist.sort()

    # ==================================================================
    # Create the mask arrays for the troposphere for Ref and Dev
    # ==================================================================
    refs_tropmask = get_troposphere_mask(refsmet)
    refe_tropmask = get_troposphere_mask(refemet)
    devs_tropmask = get_troposphere_mask(devsmet)
    deve_tropmask = get_troposphere_mask(devemet)

    # ==================================================================
    # Create a dictionary to hold all of the meterological
    # variables and mask variables that we need to pass down
    # ==================================================================
    met_and_masks = {
        "Refs_Area": refs_area,
        "Refe_Area": refe_area,
        "Devs_Area": devs_area,
        "Deve_Area": deve_area,
        "Refs_Delta_P": refsmet["Met_DELPDRY"],
        "Refe_Delta_P": refemet["Met_DELPDRY"],
        "Devs_Delta_P": devsmet["Met_DELPDRY"],
        "Deve_Delta_P": devemet["Met_DELPDRY"],
        "Refs_BxHeight": refsmet["Met_BXHEIGHT"],
        "Refe_BxHeight": refemet["Met_BXHEIGHT"],
        "Devs_BxHeight": devsmet["Met_BXHEIGHT"],
        "Deve_BxHeight": devemet["Met_BXHEIGHT"],
        "Refs_TropMask": refs_tropmask,
        "Refe_TropMask": refe_tropmask,
        "Devs_TropMask": devs_tropmask,
        "Deve_TropMask": deve_tropmask,
    }

    # ==================================================================
    # Create global mass accumulation table
    # ==================================================================
    if subdst is not None:
        mass_filename = f"GlobalMassAccumulation_TropStrat_{subdst}.txt"
    else:
        mass_filename = "GlobalMassAccumulation_TropStrat.txt"
    mass_file = os.path.join(dst, mass_filename)
    create_mass_accumulation_table(
        refSds,
        refEds,
        refstr,
        refperiodstr,
        devSds,
        devEds,
        devstr,
        devperiodstr,
        varlist,
        met_and_masks,
        label,
        outfilename=mass_file,
        verbose=verbose,
        spcdb_files=spcdb_files,
    )

    ## ==================================================================
    ## Create tropospheric mass table
    ## ==================================================================
    #if subdst is not None:
    #    mass_filename = f"GlobalMassAccumulation_Trop_{subdst}.txt"
    #else:
    #    mass_filename = 'GlobalMassAccumulation_Trop.txt'
    #mass_file = os.path.join(dst, mass_filename)
    #create_mass_accumulation_table(
    #    refSds,
    #    refEds,
    #    refstr,
    #    devSds,
    #    devEds,
    #    devstr,
    #    varlist,
    #    met_and_masks,
    #    label,
    #    outfilename=mass_file,
    #    trop_only=True,
    #    verbose=verbose,
    #    spcdb_files=spcdb_files,
    #)

    # -------------------------------------------
    # Clean up
    # -------------------------------------------
    del refSds
    del refEds
    del devSds
    del devEds
    gc.collect()




[docs]
def make_benchmark_oh_metrics(
        ref,
        refmet,
        refstr,
        dev,
        devmet,
        devstr,
        dst="./benchmark",
        overwrite=False,
):
    """
    Creates a text file containing metrics of global mean OH, MCF lifetime,
    and CH4 lifetime for benchmarking purposes.

    Parameters
    ----------
    ref : str
        Path name of "Ref" (aka "Reference") data set file.
    refmet : str
        Path name of ref meteorology data set.
    refstr : str
        A string to describe ref (e.g. version number).
    dev : str
        Path name of "Dev" (aka "Development") data set file.
        The "Dev" data set will be compared against the "Ref" data set.
    devmet : list of str
        Path name of dev meteorology data set.
    devstr : str
        A string to describe dev (e.g. version number).
    dst : str, optional
        A string denoting the destination folder where the file
        containing emissions totals will be written.
        Default value: "./benchmark"
    overwrite : bool, optional
        Set this flag to True to overwrite files in the
        destination folder (specified by the dst argument).
        Default value: False
    """

    # ==================================================================
    # Initialization
    # ==================================================================

    # Define destination directory
    make_directory(dst, overwrite)

    # Replace whitespace in the ref and dev labels
    refstr = replace_whitespace(refstr)
    devstr = replace_whitespace(devstr)

    # ==================================================================
    # Read data from netCDF into Dataset objects
    # ==================================================================

    refds = xr.open_dataset(ref, drop_variables=SKIP_THESE_VARS)
    devds = xr.open_dataset(dev, drop_variables=SKIP_THESE_VARS)
    refmetds = xr.open_dataset(refmet, drop_variables=SKIP_THESE_VARS)
    devmetds = xr.open_dataset(devmet, drop_variables=SKIP_THESE_VARS)

    # ==================================================================
    # Get tropopause mask
    # ==================================================================
    # Find the area variables in Ref and Dev
    #ref_area = get_area_from_dataset(refds)
    #dev_area = get_area_from_dataset(devds)

    # Find required meteorological variables in Ref
    # (or exit with an error if we can't find them)
    metvar_list = [
        "Met_AD",
        "Met_AIRDEN",
        "Met_BXHEIGHT",
        "Met_T",
        "Met_TropLev",
        "FracOfTimeInTrop",
    ]
    refmet = get_variables_from_dataset(refmetds, metvar_list)
    devmet = get_variables_from_dataset(devmetds, metvar_list)

    # Create the mask arrays for the troposphere for Ref and Dev
    ref_tropmask = get_troposphere_mask(refmetds)
    dev_tropmask = get_troposphere_mask(devmetds)

    # ==================================================================
    # Compute mass-weighted OH in the troposphere
    # ==================================================================

    # Ref
    ref_oh_trop = np.ma.masked_array(refds["OHconcAfterChem"].values,
                                     ref_tropmask)
    ref_airmass_trop = np.ma.masked_array(refmetds["Met_AD"].values,
                                          ref_tropmask)
    ref_oh_mass = ref_oh_trop * ref_airmass_trop
    ref_total_ohmass = np.sum(ref_oh_mass)
    ref_total_airmass = np.sum(ref_airmass_trop)
    ref_mean_oh = (ref_total_ohmass / ref_total_airmass) / 1e5

    # Dev
    dev_oh_trop = np.ma.masked_array(devds["OHconcAfterChem"].values,
                                     dev_tropmask)
    dev_airmass_trop = np.ma.masked_array(devmetds["Met_AD"].values,
                                          dev_tropmask)
    dev_oh_mass = dev_oh_trop * dev_airmass_trop
    dev_total_ohmass = np.sum(dev_oh_mass)
    dev_total_airmass = np.sum(dev_airmass_trop)
    dev_mean_oh = (dev_total_ohmass / dev_total_airmass) / 1e5

    # Diff
    oh_diff = dev_mean_oh - ref_mean_oh
    oh_pctdiff = ((dev_mean_oh - ref_mean_oh) / ref_mean_oh) * 100.0

    # ==================================================================
    # Compute MCF and CH4 lifetimes
    # ==================================================================

    # Select only boxes that are purely tropospheric
    # This excludes influence from the stratosphere
    ref_timetrop_mask = refmetds["FracOfTimeInTrop"].values != 1.0
    dev_timetrop_mask = devmetds["FracOfTimeInTrop"].values != 1.0

    # Get grid box volumes [cm3] (trop + strat)
    ref_vol = (refmetds["Met_BXHEIGHT"] * refmetds["AREA"]) * 1e6
    dev_vol = (devmetds["Met_BXHEIGHT"] * devmetds["AREA"]) * 1e6

    # Get grid box volumes [cm3] (trop only)
    ref_vol_trop = np.ma.masked_array(ref_vol.values, ref_timetrop_mask)
    dev_vol_trop = np.ma.masked_array(dev_vol.values, dev_timetrop_mask)

    # Get MCF and CH4 density [molec/cm3] (trop + strat)
    # Assume that species is evenly distributed in air, with
    # a mixing ratio of 1. Thus species density = air density.
    ref_dens = refmetds["Met_AIRDEN"] / 1e6
    dev_dens = devmetds["Met_AIRDEN"] / 1e6

    # Get MCF and CH4 density [molec/cm3] (trop only)
    ref_dens_trop = np.ma.masked_array(ref_dens.values, ref_timetrop_mask)
    dev_dens_trop = np.ma.masked_array(dev_dens.values, dev_timetrop_mask)

    # Get temperature [K] (trop only)
    ref_temp = np.ma.masked_array(refmetds["Met_T"].values, ref_timetrop_mask)
    dev_temp = np.ma.masked_array(devmetds["Met_T"].values, dev_timetrop_mask)

    # Compute Arrhenius parameter K [cm3/molec/s]
    ref_mcf_k = 1.64e-12 * np.exp(-1520e0 / ref_temp)
    dev_mcf_k = 1.64e-12 * np.exp(-1520e0 / dev_temp)
    ref_ch4_k = 2.45e-12 * np.exp(-1775e0 / ref_temp)
    dev_ch4_k = 2.45e-12 * np.exp(-1775e0 / dev_temp)

    # Numerator: Total atmospheric (trop+strat) burden
    ref_num = np.sum(ref_dens.values * ref_vol.values)
    dev_num = np.sum(dev_dens.values * dev_vol.values)

    # Denominator: Loss rate in troposphere
    ref_mcf_denom = np.sum(ref_mcf_k * ref_oh_trop *
                           ref_dens_trop * ref_vol_trop)
    dev_mcf_denom = np.sum(dev_mcf_k * dev_oh_trop *
                           dev_dens_trop * dev_vol_trop)
    ref_ch4_denom = np.sum(ref_ch4_k * ref_oh_trop *
                           ref_dens_trop * ref_vol_trop)
    dev_ch4_denom = np.sum(dev_ch4_k * dev_oh_trop *
                           dev_dens_trop * dev_vol_trop)

    # Compute lifetimes [years]
    sec_to_year = 365.25 * 86400.0
    ref_mcf_lifetime = (ref_num / ref_mcf_denom) / sec_to_year
    dev_mcf_lifetime = (dev_num / dev_mcf_denom) / sec_to_year
    ref_ch4_lifetime = (ref_num / ref_ch4_denom) / sec_to_year
    dev_ch4_lifetime = (dev_num / dev_ch4_denom) / sec_to_year

    # Compute differences
    mcf_diff = dev_mcf_lifetime - ref_mcf_lifetime
    ch4_diff = dev_ch4_lifetime - ref_ch4_lifetime

    mcf_pctdiff = ((dev_mcf_lifetime - ref_mcf_lifetime) /
                   ref_mcf_lifetime) * 100.0
    ch4_pctdiff = ((dev_ch4_lifetime - ref_ch4_lifetime) /
                   ref_ch4_lifetime) * 100.0

    # ==================================================================
    #  Define function for writing metrics to file
    # ==================================================================

    def print_metrics_to_file(f, title1, title2, ref, dev, diff, pctdiff):
        print("#" * 79, file=f)
        print(f"{title1 : <76}{'###'}", file=f)
        print(f"{title2 : <76}{'###'}", file=f)
        print("#" * 79, file=f)
        print("'{Ref' : <15}{'Dev' : <13}{'Dev - Ref` : <13}{'% diff' : <11}",
              file=f)
        print("{ref:11.6f}  {dev:11.6f}  {diff:11.6f}  {pctdiff:9.4f}", file=f)

    # ==================================================================
    # Print metrics to file
    # ==================================================================

    # Create file
    outfilename = os.path.join(dst, "OH_metrics.txt")
    f = open(outfilename, "w")

    # Write mean OH
    title1 = "### Global mass-weighted OH concentration [1e5 molec/cm3]"
    title2 = f"### Ref = {refstr}; Dev = {devstr}"
    print_metrics_to_file(f, title1, title2, ref_mean_oh, dev_mean_oh,
                          oh_diff, oh_pctdiff)

    # Write MCF lifetime
    title1 = "### MCF lifetime w/r/t tropospheric OH [years]"
    title2 = f"### Ref = {refstr}; Dev = {devstr}"
    print_metrics_to_file(f, title1, title2, ref_mcf_lifetime,
                          dev_mcf_lifetime, mcf_diff, mcf_pctdiff)

    # Write CH4 lifetime
    title1 = "### CH4 lifetime w/r/t tropospheric OH [years]"
    title2 = f"### Ref = {refstr}; Dev = {devstr}"
    print_metrics_to_file(f, title1, title2, ref_ch4_lifetime,
                          dev_ch4_lifetime, ch4_diff, ch4_pctdiff)

    # -------------------------------------------
    # Clean up
    # -------------------------------------------
    del refds
    del devds
    del refmetds
    del devmetds
    gc.collect()




[docs]
def make_benchmark_wetdep_plots(
        ref,
        refstr,
        dev,
        devstr,
        collection,
        spcdb_files,
        dst="./benchmark",
        cmpres=None,
        datestr=None,
        overwrite=False,
        verbose=False,
        benchmark_type="TransportTracersBenchmark",
        plots=["sfc", "500hpa", "zonalmean"],
        log_color_scale=False,
        normalize_by_area=False,
        areas=None,
        refmet=None,
        devmet=None,
        weightsdir='.',
        n_job=-1,
        time_mean=False,
):
    """
    Creates PDF files containing plots of species concentration
    for model benchmarking purposes.

    Parameters
    ----------
    ref : str
        Path name for the "Ref" (aka "Reference") data set.
    refstr : str
        A string to describe ref (e.g. version number).
    dev : str
        Path name for the "Dev" (aka "Development") data set.
        This data set will be compared against the "Reference"
        data set.
    devstr : str
        A string to describe dev (e.g. version number).
    collection : str
        String name of collection to plot comparisons for.
    spcdb_files : list
        Paths to species_database.yml files in Ref & Dev rundirs.
    dst : str, optional
        A string denoting the destination folder where a PDF
        file containing plots will be written.
        Default value: ./benchmark
    cmpres : str, optional
        Grid resolution at which to compare ref and dev data, e.g. '1x1.25'.
    datestr : str, optional
        A string with date information to be included in both the
        plot pdf filename and as a destination folder subdirectory
        for writing plots.
        Default value: None
    overwrite : bool, optional
        Set this flag to True to overwrite files in the
        destination folder (specified by the dst argument).
        Default value: False.
    verbose : bool, optional
        Set this flag to True to print extra informational output.
        Default value: False.
    benchmark_type : str, optional
        A string denoting the type of benchmark output to plot, options are
        FullChemBenchmark, TransportTracersBenchmark, or CH4Benchmark.
        Default value: "FullChemBenchmark"
    plots : list of str, optional
        List of plot types to create.
        Default value: ['sfc', '500hpa', 'zonalmean']
    log_color_scale : bool, optional
        Set this flag to True to enable plotting data (not diffs)
        on a log color scale.
    normalize_by_area : bool, optional
        Set this flag to true to enable normalization of data
        by surface area (i.e. kg s-1 --> kg s-1 m-2).
        Default value: False
    areas : dict of xr.DataArray, optional
        Grid box surface areas in m2 on Ref and Dev grids.
        Default value: None
    refmet : str, optional
        Path name for ref meteorology.
        Default value: None
    devmet : str, optional
        Path name for dev meteorology.
        Default value: None
    weightsdir : str, optional
        Directory in which to place (and possibly reuse) xESMF regridder
        netCDF files.
        Default value: '.'
    n_job : int, optional
        Defines the number of simultaneous workers for parallel plotting.
        Set to 1 to disable parallel plotting. Value of -1 allows the
        application to decide.
        Default value: -1
    time_mean : bool, optional
        Determines if we should average the datasets over time.
        Default value: False
    """

    # Create destination plot directory
    make_directory(dst, overwrite)

    # Make a collection subdirectory
    targetdst = os.path.join(dst, collection)
    if not os.path.isdir(targetdst):
        os.mkdir(targetdst)

    # If datestr is passed, create a further subdirectory
    if datestr is not None:
        targetdst = os.path.join(targetdst, datestr)
        if not os.path.isdir(targetdst):
            os.mkdir(targetdst)

    # Replace whitespace in the ref and dev labels
    refstr = replace_whitespace(refstr)
    devstr = replace_whitespace(devstr)

    # Get the function that will read file(s) into a dataset
    reader = dataset_reader(time_mean, verbose=verbose)

    # Open datasets
    refds = reader(ref, drop_variables=SKIP_THESE_VARS)
    devds = reader(dev, drop_variables=SKIP_THESE_VARS)

    # Open met datasets if passed as arguments
    refmetds = None
    devmetds = None
    if refmet is not None:
        refmetds = reader(refmet, drop_variables=SKIP_THESE_VARS)
    if devmet is not None:
        devmetds = reader(devmet, drop_variables=SKIP_THESE_VARS)

    # Compute mean of data over the time dimension (if time_mean=True)
    if time_mean:
        refds = dataset_mean(refds)
        devds = dataset_mean(devds)
        if refmet is not None:
            refmetds = dataset_mean(refmetds)
        if devmet is not None:
            devmetds = dataset_mean(devmetds)

    # Make sure that Ref and Dev datasets have the same variables.
    # Variables that are in Ref but not in Dev will be added to Dev
    # with all missing values (NaNs). And vice-versa.
    # Turn this off for now since add_missing_variables inserts GCC area into
    # GCHP files, which causes problems with area normalization (ewl)
    #[refds, devds] = add_missing_variables(refds, devds)

    # Get list of variables in collection
    vardict = compare_varnames(refds, devds, quiet=not verbose)
    varlist = [v for v in vardict["commonvars3D"] if collection + "_" in v]
    varlist.sort()

    # Surface plots
    if "sfc" in plots:
        if datestr is not None:
            plotfilename = f"{collection}_Surface_{datestr}.pdf"
        else:
            plotfilename = f"{collection}_Surface.pdf"
        pdfname = os.path.join(targetdst, plotfilename)
        compare_single_level(
            refds,
            refstr,
            devds,
            devstr,
            varlist=varlist,
            cmpres=cmpres,
            ilev=0,
            refmet=refmetds,
            devmet=devmetds,
            pdfname=pdfname,
            normalize_by_area=normalize_by_area,
            extra_title_txt=datestr,
            weightsdir=weightsdir,
            n_job=n_job,
            spcdb_files=spcdb_files,
        )
        add_bookmarks_to_pdf(
            pdfname,
            varlist,
            remove_prefix=collection + '_',
            verbose=verbose)

    # 500 hPa plots
    if "500hpa" in plots:
        if datestr is not None:
            plotfilename = f"{collection}_500hPa_{datestr}.pdf"
        else:
            plotfilename = f"{collection}_500hPa.pdf"
        pdfname = os.path.join(targetdst, plotfilename)
        compare_single_level(
            refds,
            refstr,
            devds,
            devstr,
            varlist=varlist,
            cmpres=cmpres,
            ilev=22,
            refmet=refmetds,
            devmet=devmetds,
            pdfname=pdfname,
            normalize_by_area=normalize_by_area,
            extra_title_txt=datestr,
            weightsdir=weightsdir,
            n_job=n_job,
            spcdb_files=spcdb_files,
        )
        add_bookmarks_to_pdf(
            pdfname,
            varlist,
            remove_prefix=collection + '_',
            verbose=verbose
        )

    # Zonal mean plots
    if "zonalmean" in plots or "zm" in plots:

        # Full column
        if datestr is not None:
            plotfilename = f"{collection}_FullColumn_ZonalMean_{datestr}.pdf"
        else:
            plotfilename = f"{collection}_FullColumn_ZonalMean.pdf"
        pdfname = os.path.join(targetdst, plotfilename)
        compare_zonal_mean(
            refds,
            refstr,
            devds,
            devstr,
            varlist=varlist,
            refmet=refmetds,
            devmet=devmetds,
            pdfname=pdfname,
            log_color_scale=log_color_scale,
            normalize_by_area=normalize_by_area,
            extra_title_txt=datestr,
            weightsdir=weightsdir,
            n_job=n_job,
            spcdb_files=spcdb_files,
        )
        add_bookmarks_to_pdf(
            pdfname,
            varlist,
            remove_prefix=collection + '_',
            verbose=verbose
        )

        # Stratosphere
        if datestr is not None:
            plotfilename = f"{collection}_Strat_ZonalMean_{datestr}.pdf"
        else:
            plotfilename = f"{collection}_Strat_ZonalMean.pdf"
        pdfname = os.path.join(targetdst, plotfilename)
        compare_zonal_mean(
            refds,
            refstr,
            devds,
            devstr,
            varlist=varlist,
            refmet=refmetds,
            devmet=devmetds,
            pdfname=pdfname,
            pres_range=[1, 100],
            log_yaxis=True,
            extra_title_txt=datestr,
            normalize_by_area=normalize_by_area,
            weightsdir=weightsdir,
            n_job=n_job,
            spcdb_files=spcdb_files
        )
        add_bookmarks_to_pdf(
            pdfname,
            varlist,
            remove_prefix=collection + '_',
            verbose=verbose
        )

    # -------------------------------------------
    # Clean up
    # -------------------------------------------
    del refds
    del devds
    del refmetds
    del devmetds
    gc.collect()




[docs]
def make_benchmark_aerosol_tables(
        devdir,
        devlist_aero,
        devlist_spc,
        devlist_met,
        devstr,
        year,
        days_per_mon,
        spcdb_files,
        dst='./benchmark',
        overwrite=False,
        is_gchp=False,
):
    """
    Compute FullChemBenchmark aerosol budgets & burdens.

    Parameters
    ----------
    devdir : str
        Path to development ("Dev") data directory.
    devlist_aero : list of str
        List of Aerosols collection files (different months).
    devlist_spc : list of str
        List of SpeciesConc collection files (different months).
    devlist_met : list of str
        List of meteorology collection files (different months).
    devstr : str
        Descriptive string for datasets (e.g. version number).
    year : str
        The year of the benchmark simulation (e.g. '2016').
    days_per_mon : list of int
        List of number of days per month for all months.
    spcdb_files : list
        Paths to species_database.yml files in Ref & Dev rundirs.
    dst : str, optional
        Directory where budget tables will be created.
        Default value: './benchmark'
    overwrite : bool, optional
        Overwrite burden & budget tables?
        Default value: False
    is_gchp : bool, optional
        Whether datasets are for GCHP.
        Default value: False
    """
    # Create destination directory
    make_directory(dst, overwrite)

    # List of species (and subsets for the trop & strat)
    species_list = [
        "BCPI", "OCPI", "SO4", "DST1", "DST2", "DST3", "DST4",
        "DSTbin1", "DSTbin2", "DSTbin3", "DSTbin4", "DSTbin5",
        "DSTbin6", "DSTbin7", "SALA", "SALC"
    ]

    # Read the species database files in the Ref & Dev rundirs, and
    # return a dict containing metadata for the union of species.
    # We'll need properties such as mol. wt. for unit conversions, etc.
    _, dev_metadata = read_species_metadata(spcdb_files, quiet=True)

    # Get the list of relevant AOD diagnostics from a YAML file
    ifile= AOD_SPC
    aod = read_config_file(
        os.path.join(
            os.path.dirname(__file__),
            ifile,
        ),
        quiet=True
    )
    ds_aer = xr.open_mfdataset(
        devlist_aero,
        drop_variables=SKIP_THESE_VARS
    )
    ds_spc = xr.open_mfdataset(
        devlist_spc,
        drop_variables=SKIP_THESE_VARS
    )
    ds_met = xr.open_mfdataset(
        devlist_met,
        drop_variables=SKIP_THESE_VARS
    )

    # Rename SpeciesConc_ to SpeciesConcVV_ for consistency with new
    # naming introduced in GEOS-Chem 14.1.0
    ds_spc = rename_speciesconc_to_speciesconcvv(ds_spc)

    # Trim species_list to the variables that are only in the dataset
    varlist = [
        var for var in species_list if f"SpeciesConcVV_{var}" in ds_spc.data_vars
    ]

    # Molecular weights [g mol-1], as taken from the species database
    mw = {}
    full_names = {}
    for var in varlist:
        full_names[var] = dev_metadata[var]["FullName"].strip()
        mw[var] = dev_metadata[var]["MW_g"]
    mw["Air"] = MW_AIR_g

    # Get troposphere mask
    tropmask = get_troposphere_mask(ds_met)

    # Get number of months
    n_mon = len(days_per_mon)

    # ------------------------------------------------------------------
    # Surface area
    # (kludgey but it works - revisit this)
    # ------------------------------------------------------------------

    # Get number of vertical levels
    N_LEVS = ds_spc.dims["lev"]

    if is_gchp:
        area = ds_met["Met_AREAM2"].values
        a = area.shape
        area_m2 = np.zeros([a[0], N_LEVS, a[1], a[2], a[3]])
        for t in range(n_mon):
            for k in range(N_LEVS):
                area_m2[t, k, :, :, :] = area[t, :, :, :]
        total_area_m2 = np.sum(area_m2[0, 0, :, :, :])
    else:
        area = ds_met["AREA"].values
        a = area.shape
        area_m2 = np.zeros([a[0], N_LEVS, a[1], a[2]])
        for t in range(n_mon):
            for k in range(N_LEVS):
                area_m2[t, k, :, :] = area[t, :, :]
        total_area_m2 = np.sum(area_m2[0, 0, :, :])

    # ------------------------------------------------------------------
    # Conversion factors and time increments
    # ------------------------------------------------------------------
    # v/v dry --> Tg
    vv_to_Tg = {}
    for var in varlist:
        if f"SpeciesConcVV_{var}" in ds_spc.data_vars:
            vv_to_Tg[var] = \
                ds_met["Met_AD"].values * (mw[var] / mw["Air"]) * 1e-9

    # Days in the benchmark duration
    days_per_yr = np.sum(days_per_mon)

    # ------------------------------------------------------------------
    # Define function to print tables
    # ------------------------------------------------------------------
    def print_aerosol_metrics(data, varlist, namelist, filename, title, label):

        with open(filename, "w+", encoding=ENCODING) as ofile:

            # Print top header
            print("%" * 79, file=ofile)
            print(f" {title} for {year} in {devstr}", file=ofile)
            print(" (weighted by the number of days per month)", file=ofile)
            print("%" * 79, file=ofile)
            line = "\n" + " " *67 + "Strat         Trop         Strat+Trop\n"
            line += " " * 67 + "-----------   ----------   ----------"
            print(line, file=ofile)

            # Print data
            for var in varlist:
                line = f"{namelist[var] : <41} ({var : <7}) {label} :  {data[var + '_s']:11.9f}   {data[var + '_t']:10.8f}   {data[var + '_f']:10.8f}\n"
                print(line, file=ofile)

    # ------------------------------------------------------------------
    # Compute aerosol burdens [Tg] and print
    # ------------------------------------------------------------------

    # Table info
    filename = f"{dst}/Aerosol_Burdens.txt"
    if n_mon == 12:
        title = "Annual average global aerosol burdens"
    else:
        title = f"Average global aerosol burdens across {n_mon} months"
    label = "burden [Tg]"

    # Initialize
    q = {}
    q_sum_f = np.zeros(n_mon)
    q_sum_t = np.zeros(n_mon)
    q_sum_s = np.zeros(n_mon)
    burdens = {}

    # Define the axes we need to sum over to make monthly sums
    if is_gchp:
        sum_axes = (1, 2, 3, 4)
    else:
        sum_axes = (1, 2, 3)

    # Loop over species
    for var in varlist:

        # Whole-atmosphere and trop-only quantities [g]
        # NOTE: DryDep is by nature trop-only
        varname = f"SpeciesConcVV_{var}"
        q[var + "_f"] = ds_spc[varname].values * vv_to_Tg[var]
        q[var + "_t"] = np.ma.masked_array(q[var + "_f"], tropmask)

        # Compute monthly sums, weighted by the number of days per month
        q_sum_f = np.sum(q[var + "_f"], axis=sum_axes) * days_per_mon
        q_sum_t = np.sum(q[var + "_t"], axis=sum_axes) * days_per_mon
        q_sum_s = q_sum_f - q_sum_t

        # Compute annual averages
        burdens[var + "_f"] = np.sum(q_sum_f) / days_per_yr
        burdens[var + "_t"] = np.sum(q_sum_t) / days_per_yr
        burdens[var + "_s"] = np.sum(q_sum_s) / days_per_yr

    print_aerosol_metrics(burdens, varlist, full_names, filename, title, label)

    # ------------------------------------------------------------------
    # Define function to print tables
    # ------------------------------------------------------------------
    def print_aods(data, varlist, filename, title):

        with open(filename, "w+", encoding=ENCODING) as ofile:

            # Print top header
            print("%" * 79, file=ofile)
            print(f" {title} for {year} in {devstr}", file=ofile)
            print(" (weighted by the number of days per month)", file=ofile)
            print("%" * 79, file=ofile)
            line = "\n" + " " *32 + "Strat         Trop         Strat+Trop\n"
            line += " " * 32 + "-----------   ----------   ----------"
            print(line, file=ofile)

            # Print data
            for var in varlist:
                line = f"{var : <4} column optical depth [1]:  {data[var + '_s']:11.9f}   {data[var + '_t']:10.8f}   {data[var + '_f']:10.8f}\n"
                print(line, file=ofile)

    # ------------------------------------------------------------------
    # Compute average AOD's [Tg] and print
    # ------------------------------------------------------------------

    # Table info
    filename = f"{dst}/Global_Mean_AOD.txt"
    if n_mon == 12:
        title = "Annual average global AODs"
    else:
        title = f"Average global AODs across {n_mon} months"
    label = "mean AOD [1]"

    # Initialize
    q = {}
    q_sum_f = np.zeros(n_mon)
    q_sum_t = np.zeros(n_mon)
    q_sum_s = np.zeros(n_mon)
    aods = {}

    # Define axes to sum over, and total surface area
    if is_gchp:
        sum_axes = (1, 2, 3, 4)
    else:
        sum_axes = (1, 2, 3)

    # List of AOD species to plot
    varlist = [var for var in aod.keys() if "Dust" in var or "Hyg" in var]
    if is_gchp:
        varlist = [var.replace('550nm', 'WL1') for var in varlist]

    # Loop over AOD variables
    aod_list = []
    for varname in varlist:

        # Extract the s
        if "Dust" in varname:
            var = "Dust"
        elif "Total" in varname:
            continue
        else:
            var = varname.split("_")[1]
        aod_list.append(var)

        # Whole-atmosphere AOD [1]
        q[var + "_f"] = ds_aer[varname].values

        # Tropospheric-only AOD [1]
        q[var + "_t"] = np.ma.masked_array(q[var + "_f"], tropmask)

        # Create monthly sums, weighted by the number of days per month
        q_sum_f = np.sum(q[var + "_f"] * area_m2, axis=sum_axes) * days_per_mon
        q_sum_t = np.sum(q[var + "_t"] * area_m2, axis=sum_axes) * days_per_mon
        q_sum_s = q_sum_f - q_sum_t

        # Take annual averages
        aods[var + "_f"] = np.sum(q_sum_f) / total_area_m2 / days_per_yr
        aods[var + "_t"] = np.sum(q_sum_t) / total_area_m2 / days_per_yr
        aods[var + "_s"] = np.sum(q_sum_s) / total_area_m2 / days_per_yr

    print_aods(aods, aod_list, filename, title)

    # ------------------------------------------------------------------
    # Clean up
    # ------------------------------------------------------------------
    del ds_aer
    del ds_spc
    del ds_met
    gc.collect()




[docs]
def make_benchmark_operations_budget(
        refstr,
        reffiles,
        devstr,
        devfiles,
        ref_interval,
        dev_interval,
        spcdb_files,
        benchmark_type=None,
        label=None,
        col_sections=["Full", "Trop", "PBL", "FixedLevs", "Strat"],
        operations=["Chemistry", "Convection", "EmisDryDep",
                    "Mixing", "Transport", "WetDep"],
        compute_accum=True,
        compute_restart=False,
        require_overlap=False,
        dst='.',
        species=None,
        overwrite=True,
        verbose=False,
):
    """
    Prints the "operations budget" (i.e. change in mass after
    each operation) from a GEOS-Chem benchmark simulation.

    Parameters
    ----------
    refstr : str
        Labels denoting the "Ref" versions.
    reffiles : list of str
        Lists of files to read from the "Ref" version.
    devstr : str
        Labels denoting the "Dev" versions.
    devfiles : list of str
        Lists of files to read from "Dev" version.
    ref_interval : float
        Number of seconds in the ref diagnostic interval.
    dev_interval : float
        Number of seconds in the dev diagnostic interval.
    spcdb_files : list
        Paths to species_database.yml files in Ref & Dev rundirs.
    benchmark_type : str, optional
        A string denoting the type of benchmark output to plot, options are
        FullChemBenchmark, TransportTracersBenchmark, or CH4Benchmark.
        Default value: None
    label : str, optional
        Contains the date or date range for each dataframe title.
        Default value: None
    col_sections : list of str, optional
        List of column sections to calculate global budgets for. May
        include Strat eventhough not calculated in GEOS-Chem, but Full
        and Trop must also be present to calculate Strat.
        Default value: ["Full", "Trop", "PBL", "FixedLevs", "Strat"]
    operations : list of str, optional
        List of operations to calculate global budgets for. Accumulation
        should not be included. It will automatically be calculated if
        all GEOS-Chem budget operations are passed and optional arg
        compute_accum is True.
        Default value: ["Chemistry","Convection","EmisDryDep",
        "Mixing","Transport","WetDep"]
    compute_accum : bool, optional
        Optionally turn on/off accumulation calculation. If True, will
        only compute accumulation if all six GEOS-Chem operations budgets
        are computed. Otherwise a message will be printed warning that
        accumulation will not be calculated.
        Default value: True
    compute_restart : bool, optional
        Optionally turn on/off calculation of mass change based on restart
        file. Only functional for "Full" column section.
        Default value: False
    require_overlap : bool, optional
        Whether to calculate budgets for only species that are present in
        both Ref or Dev.
        Default value: False
    dst : str, optional
        Directory where plots & tables will be created.
        Default value: '.' (directory in which function is called)
    species : list of str, optional
        List of species for which budgets will be created.
        Default value: None (all species)
    overwrite : bool, optional
        Denotes whether to overwrite existing budget file.
        Default value: True
    verbose : bool, optional
        Set this switch to True if you wish to print out extra
        informational messages.
        Default value: False
    """
    # Replace whitespace in the ref and dev labels
    refstr = replace_whitespace(refstr)
    devstr = replace_whitespace(devstr)

    # ------------------------------------------
    # Column sections
    # ------------------------------------------

    # Print info. Only allow Strat if Trop and Full are present
    print("Column sections:")
    for col_section in col_sections:
        print(f"  {col_section}")
    n_sections = len(col_sections)
    compute_strat = False
    if "Strat" in col_sections:
        compute_strat = True
        if "Full" not in col_sections or "Trop" not in col_sections:
            msg = "Strat budget table requires Full and Trop column sections"
            raise ValueError(msg)
        print("*** Will compute Strat budget as difference of Full"
              " and Trop ***")

    # Make a subset of column sections not including Strat
    gc_sections = [r for r in col_sections if "Strat" not in r]

    # ------------------------------------------
    # Operations
    # ------------------------------------------

    # Rename the optional argument to be clear it is GEOS-Chem budget
    # operations
    gc_operations = operations

    # Handle whether to compute accumulation.
    all_operations = gc_operations
    if compute_accum and len(gc_operations) == 6:
        all_operations = gc_operations + ["ACCUMULATION"]
    if compute_restart:
        all_operations = gc_operations + ["RESTART"]
    n_ops = len(all_operations)

    # Print info
    print("Operations:")
    for all_operation in all_operations:
        print(f"  {all_operation}")
    if compute_accum:
        if "ACCUMULATION" in all_operations:
            print("*** Will compute ACCUMULATION operation as sum of all "
                  "GEOS-Chem operations ***")
        else:
            print("***Will not compute ACCUMULATION since not all GEOS-Chem"
                  " operation budgets will be computed.")
    if compute_restart:
        print("*** Will compute RESTART operation as mass change "
              "based on simulation start and end restart files ***")

    # ------------------------------------------
    # Read data
    # ------------------------------------------

    # Assume this will be annual budget if interval greater than 3e7 sec
    annual = ref_interval > 3.0e7

    # Read data from disk (either one month or 12 months)
    print('Opening ref and dev data')
    skip_vars = SKIP_THESE_VARS
    if annual:
        ref_ds = xr.open_mfdataset(reffiles, drop_variables=skip_vars)
        dev_ds = xr.open_mfdataset(devfiles, drop_variables=skip_vars)
    else:
        ref_ds = xr.open_dataset(reffiles, drop_variables=skip_vars)
        dev_ds = xr.open_dataset(devfiles, drop_variables=skip_vars)

    # TODO: Add section for reading files for computing mass from restart file

    # ------------------------------------------
    # Species
    # ------------------------------------------

    # Get information about variables in data files
    vardict = compare_varnames(ref_ds, dev_ds, quiet=True)
    refonly = vardict["refonly"]
    devonly = vardict["devonly"]
    cmnvars = vardict["commonvars2D"]

    # Reduce each list to only variables containing "Budget" and not "Strat"
    refonly = [v for v in refonly if "Budget" in v and "Strat" not in v]
    devonly = [v for v in devonly if "Budget" in v and "Strat" not in v]
    cmnvars = [v for v in cmnvars if "Budget" in v and "Strat" not in v]

    # Special handling for fixed level budget diagnostic
    # Get variable name prefix, e.g. Levs1to35. Check that all fixed level
    # vars have the same prefix. Update section names used in table.
    fixedlevvars = [v for v in cmnvars if "Budget" in v and "Levs" in v]
    if fixedlevvars is not None:
        fixedlevnames = [v[v.index('Levs'):].split("_")[0] for v in fixedlevvars]
        if len(set(fixedlevnames)) > 1:
            msg = "Budget fixed level diagnostic name must be constant!"
            raise ValueError(msg)
        col_sections = [v.replace('FixedLevs',fixedlevnames[0]) for v in col_sections]
        gc_sections = [v.replace('FixedLevs',fixedlevnames[0]) for v in gc_sections]

    # Get the species list, depending on if species was passed as argument.
    if species is not None:
        spclist = species
    else:
        # For each column section, get the union or intersection (depending
        # on optional argument require_overlap) of all budget diagnostic
        # species and sort alphabetically. A species is counted even if only
        # present for as few as one operation.
        spclists = {}
        for gc_section in gc_sections:
            refspc = [v.split("_")[1] for v in refonly if gc_section in v]
            devspc = [v.split("_")[1] for v in devonly if gc_section in v]
            cmnspc = [v.split("_")[1] for v in cmnvars if gc_section in v]
            if require_overlap:
                section_spc = list(set(cmnspc))
            else:
                section_spc = list(set(refspc + devspc + cmnspc))
            section_spc.sort()
            spclists[gc_section] = section_spc

        # If calculating Strat, use intersection of Trop and Full
        if "Strat" in col_sections:
            spclists["Strat"] = list(set(spclists["Trop"] + spclists["Full"]))

        # For now, define one species list as species combined across all
        # column sections. Compute budgets of these species for each section.
        # NOTE: eventually would want to be able to compute budget for only
        # species per column section, not all species for all section. If that
        # is implemented then handling of optional species list needs to be
        # revisited.
        spclist = []
        for s in spclists:
            spclist = spclist + spclists[s]

    # Make list items unique and put in alphabetical order
    spclist = list(set(spclist))
    spclist.sort()
    n_spc = len(spclist)

    # ------------------------------------------
    # Concentration units and if a wetdep species
    # ------------------------------------------

    # Load a YAML file containing species properties
    ref_metadata, dev_metadata = read_species_metadata(
        spcdb_files,
        quiet=True
    )

    # Determine what the converted units and conversion factor should be
    # based on benchmark type and species (tracer) name. Assume raw data [kg/s]
    ref_conv_fac = {}
    dev_conv_fac = {}
    units = {}
    is_wetdep = {}
    for spc in spclist:

        # Identify wetdep species
        is_wetdep[spc] = None
        ref_species_metadata = ref_metadata.get(spc)
        dev_species_metadata = dev_metadata.get(spc)
        if ref_species_metadata is not None and \
           dev_species_metadata is not None:
            is_wetdep[spc] = \
                ref_species_metadata.get("Is_WetDep") or \
                dev_species_metadata.get("Is_WetDep")

        # Unit conversion factors and units
        ref_conv_fac[spc] = ref_interval * 1e-6
        dev_conv_fac[spc] = dev_interval * 1e-6
        units[spc] = '[Gg]'
        if benchmark_type is not None:
            if "TransportTracers" in benchmark_type and "Tracer" not in spc:
                ref_conv_fac[spc] = ref_interval
                dev_conv_fac[spc] = dev_interval
                if annual:
                    units[spc] = '[kg/yr]'
                else:
                    units[spc] = '[kg]'
            elif annual:
                ref_conv_fac[spc] = ref_interval * 1e-9
                dev_conv_fac[spc] = dev_interval * 1e-9
                units[spc] = '[Tg/yr]'

    # ------------------------------------------
    # Create dataframe
    # ------------------------------------------
    columns = ["Column_Section", "Species", "Operation", "Ref_raw", "Dev_raw",
               "Units_converted", "Ref", "Dev", "Diff", "Pct_diff"]

    # Make column data to initialize with
    col_section = list(itertools.chain.from_iterable(
        itertools.repeat(x, n_ops * n_spc) for x in col_sections))
    col_spc = list(itertools.chain.from_iterable(
        itertools.repeat(x, n_ops) for x in spclist)) * n_sections
    col_ops = all_operations * n_sections * n_spc

    # Put the column data together into a dictionary
    data = {
        'Species': col_spc,
        'Operation': col_ops,
        'Column_Section': col_section,
    }

    # Create the dataframe from the data dictionary and column names list
    df = pd.DataFrame(data, columns=columns)

    # ------------------------------------------
    # Populate dataframe for GEOS-Chem operations and column sections
    # ------------------------------------------
    print('Calculating budgets for all data operations and column sections...')

    # Loop over sections (only those with data in files)
    for gc_section in gc_sections:

        # Keep track of progress in log
        print(f"  {gc_section}")

        # Loop over species in that section
        for i, spc in enumerate(spclist):

            # Keep track of progress (debugging print)
            #if (i + 1) % 50 == 0:
            #    print(f"  {gc_section}: species {i + 1} of {n_spc}")

            # Loop over operations (only those with data in files)
            for gc_operation in gc_operations:

                # Get the dataframe row to fill. Skip if not found.
                dfrow = (df["Column_Section"] == gc_section) \
                    & (df["Species"] == spc) \
                    & (df["Operation"] == gc_operation)
                if not any(dfrow):
                    continue

                # Get the variable name in the datasets
                varname = "Budget" + gc_operation + gc_section + "_" + spc

                # Calculate Ref and dev raw as global sum
                if varname in ref_ds.data_vars.keys():
                    refraw = ref_ds[varname].values.sum()
                elif gc_operation == "WetDep" and is_wetdep[spc] is None:
                    refraw = 0.0
                else:
                    refraw = np.nan
                if varname in dev_ds.data_vars.keys():
                    devraw = dev_ds[varname].values.sum()
                elif gc_operation == "WetDep" and is_wetdep[spc] is None:
                    devraw = 0.0
                else:
                    devraw = np.nan

                # Convert units
                refconv = refraw * ref_conv_fac[spc]
                devconv = devraw * dev_conv_fac[spc]

                # Calculate diff and % diff from conc with converted units
                if not np.isnan(refconv) and not np.isnan(devconv):
                    diff = devconv - refconv
                    try:
                        pctdiff = diff / refconv * 100
                    except BaseException:
                        pctdiff = np.nan
                else:
                    diff = np.nan
                    pctdiff = np.nan

                # Fill dataframe
                df.loc[dfrow, "Ref_raw"] = refraw
                df.loc[dfrow, "Dev_raw"] = devraw
                df.loc[dfrow, "Units_converted"] = units[spc]
                df.loc[dfrow, "Ref"] = refconv
                df.loc[dfrow, "Dev"] = devconv
                df.loc[dfrow, "Diff"] = diff
                df.loc[dfrow, "Pct_diff"] = pctdiff

    # ------------------------------------------
    # Compute Strat for each data operation (if applicable)
    # ------------------------------------------
    if compute_strat:
        print('Computing Strat budgets from Trop and Full')

        # Loop over species
        for i, spc in enumerate(spclist):

            # Keep track of progress (debugging print)
            #if (i + 1) % 50 == 0:
            #    print(f"  Strat: species {i + 1} of {n_spc}")

            # Loop over operations (only those with data in files)
            for gc_operation in gc_operations:

                # Get the strat dataframe row to fill. Skip if not found.
                dfrow = (df["Column_Section"] == "Strat") \
                    & (df["Species"] == spc) \
                    & (df["Operation"] == gc_operation)
                if not any(dfrow):
                    continue

                # Get the "Full" row to use in the calculation
                dfrow_full = (df["Column_Section"] == "Full") \
                    & (df["Species"] == spc) \
                    & (df["Operation"] == gc_operation)
                if not any(dfrow_full):
                    continue

                # Get the "Trop" row to use in the calculation
                dfrow_trop = (df["Column_Section"] == "Trop") \
                    & (df["Species"] == spc) \
                    & (df["Operation"] == gc_operation)
                if not any(dfrow_trop):
                    continue

                # Calculate strat concentrations as Full minus Trop. Do
                # not bother with raw value computation.
                refstrat = df.loc[dfrow_full, "Ref"].values[0] \
                    - df.loc[dfrow_trop, "Ref"].values[0]
                devstrat = df.loc[dfrow_full, "Dev"].values[0] \
                    - df.loc[dfrow_trop, "Dev"].values[0]

                # Calculate diff and % diff
                if not np.isnan(refstrat) and not np.isnan(devstrat):
                    diff = devstrat - refstrat
                    try:
                        pctdiff = diff / refstrat * 100
                    except BaseException:
                        pctdiff = np.nan
                else:
                    diff = np.nan
                    pctdiff = np.nan

                # Fill dataframe
                df.loc[dfrow, "Units_converted"] = units[spc]
                df.loc[dfrow, "Ref"] = refstrat
                df.loc[dfrow, "Dev"] = devstrat
                df.loc[dfrow, "Diff"] = diff
                df.loc[dfrow, "Pct_diff"] = pctdiff

    # ------------------------------------------
    # Compute Accumulation for each column section (if applicable)
    # ------------------------------------------
    if compute_accum:
        print('Computing ACCUMULATION operation budgets...')

        # Loop over all column sections
        for col_section in col_sections:

            # Keep track of progress in log
            print(f"  {col_section}")

            # Loop over species
            for i, spc in enumerate(spclist):

                # Keep track of progress (debugging print)
                #if (i + 1) % 50 == 0:
                #    print(f"  {col_section}: species {i + 1} of {n_spc}")

                # Get the accumulation dataframe row to fill.Skip if not found.
                dfrow = (df["Column_Section"] == col_section) \
                    & (df["Species"] == spc) \
                    & (df["Operation"] == "ACCUMULATION")
                if not any(dfrow):
                    continue

                # Get the rows to sum
                dfrows_to_sum = (df["Column_Section"] == col_section) \
                    & (df["Species"] == spc) \
                    & (df["Operation"].isin(gc_operations))

                # Sum the concentrations. If there is one or more NaN values
                # then set to NaN since not enough data for accumulation.
                refsum = df.loc[dfrows_to_sum, "Ref"].sum()
                devsum = df.loc[dfrows_to_sum, "Dev"].sum()

                # Calculate diff and % diff
                if not np.isnan(refsum) and not np.isnan(devsum):
                    diff = devsum - refsum
                    try:
                        pctdiff = diff / refsum * 100
                    except BaseException:
                        pctdiff = np.nan
                else:
                    diff = np.nan
                    pctdiff = np.nan

                # Fill dataframe
                df.loc[dfrow, "Units_converted"] = units[spc]
                df.loc[dfrow, "Ref"] = refsum
                df.loc[dfrow, "Dev"] = devsum
                df.loc[dfrow, "Diff"] = diff
                df.loc[dfrow, "Pct_diff"] = pctdiff

    # ------------------------------------------
    # Compute mass change in restarts for each column section (if applicable)
    # ------------------------------------------
    if compute_restart:
        print('Computing RESTART operation budgets...')

        # Read the species database files in the Ref & Dev rundirs,
        # and return a dict containing metadata for each.
        ref_metadata, dev_metadata = read_species_metadata(
            spcdb_files,
            quiet=True
        )

        # Loop over all column sections
        for col_section in col_sections:

            # Loop over species
            for i, spc in enumerate(spclist):

                # Keep track of progress
                if (i + 1) % 50 == 0:
                    print(f"  {col_section}: species {i + 1} of {n_spc}")

                # Get the accumulation dataframe row to fill. Skip if not found
                # of if not Full column section.
                dfrow = (df["Column_Section"] == "Full") \
                    & (df["Species"] == spc) \
                    & (df["Operation"] == "RESTART")
                if not any(dfrow):
                    continue

                # Get ref and dev mass

                # Get species metadata for unit conversion. If none, skip.
                ref_species_metadata = ref_metadata.get(spc)
                dev_species_metadata = dev_metadata.get(spc)
                if ref_species_metadata is None and \
                   dev_species_metadata is None:
                    continue

                # Get molecular weights
                #ref_mol_wt_g = get_molwt_from_metadata(
                #    ref_species_metadata,
                #    spc
                #)
                #dev_mol_wt_g = get_molwt_from_metadata(
                #    ref_species_metadata,
                #    spc
                #)

                # Specify target units
                target_units = "Gg"

                # ==============================================================
                # Convert units of Ref and save to a DataArray
                # (or skip if Ref contains NaNs everywhere)
                # ==============================================================
                refarray = refdata[v]
                if not np.isnan(refdata[v].values).all():
                    refarray = convert_units(
                        refarray,
                        spc,
                        ref_species_metadata,
                        target_units,
                        area_m2=met_and_masks["Ref_Area"],
                        delta_p=met_and_masks["Ref_Delta_P"],
                        box_height=met_and_masks["Ref_BxHeight"],
                    )

                # ==============================================================
                # Convert units of Dev and save to a DataArray
                # (or skip if Dev contains NaNs everywhere)
                # ==============================================================
                devarray = devdata[v]
                if not np.isnan(devdata[v].values).all():
                    devarray = convert_units(
                        devarray,
                        spc_name,
                        dev_species_metadata,
                        target_units,
                        area_m2=met_and_masks["Dev_Area"],
                        delta_p=met_and_masks["Dev_Delta_P"],
                        box_height=met_and_masks["Dev_BxHeight"],
                    )


                # Compute ref mass as end mass minus start mass in ref
                # TODO

                # Compute dev mass as end mass minus start mass in dev
                # TODO - copy above once compete. The rest should just work.

                # Calculate diff and % diff
                if not np.isnan(refmass) and not np.isnan(devmass):
                    diff = devmass - refmass
                    try:
                        pctdiff = diff / refmass * 100
                    except BaseException:
                        pctdiff = np.nan
                else:
                    diff = np.nan
                    pctdiff = np.nan

                # Fill dataframe
                df.loc[dfrow, "Units_converted"] = units[spc]
                df.loc[dfrow, "Ref"] = refmass
                df.loc[dfrow, "Dev"] = devmass
                df.loc[dfrow, "Diff"] = diff
                df.loc[dfrow, "Pct_diff"] = pctdiff

    #  Sanity check write to csv (for testing. Keep commented out otherwise)
    #df.to_csv('df.csv', na_rep='NA')

    # ------------------------------------------
    # Make budget file
    # ------------------------------------------

    # Create the target output directory hierarchy if it doesn't already exist
    make_directory(dst, overwrite)

    # Print budgets to file
    filename = f"{dst}/Budgets_After_Operations.txt"
    with open(filename, "w+", encoding=ENCODING) as f:
        print("#" * 78, file=f)
        if label is not None and benchmark_type is not None:
            print(f"{benchmark_type} budgets for {label}", file=f)
        else:
            print(f"Budgets across {ref_interval}/{dev_interval} sec", file=f)
        print("\n", file=f)
        print("NOTES:", file=f)
        msg = " - When using the non-local mixing scheme (default), "\
              "'Mixing' includes\n   emissions and dry deposition "\
              "applied below the PBL. 'EmisDryDep'\n   therefore only "\
              "captures fluxes above the PBL.\n - When using full mixing, "\
              "'Mixing' and 'EmisDryDep' are fully separated.\n - Budgets "\
              "are calculated as the sum of [kg/s] tendencies\n - Strat "\
              "budget is calculated as Full minus Trop\n - ACCUMULATION "\
              "is calculated as sum of all other operations"
        print(msg, file=f)
        print("#" * 78, file=f)
        print(file=f)

        # Loop over species
        for i, spc in enumerate(spclist):
            print(f"{spc} budgets (Ref={refstr}; Dev={devstr})", file=f)

            # Print a table for each column section
            for col_section in col_sections:

                # Get the dataframe rows. Skip if none found.
                dfrows = (df["Column_Section"] == col_section) \
                    & (df["Species"] == spc) \
                    & (df["Operation"].isin(all_operations))
                if not any(dfrows):
                    continue

                # Print dataframe subset to file
                print(f"{col_section} {units[spc]} : {spc}", file=f)
                print(tabulate(
                    df.loc[dfrows,
                           ["Operation",
                            "Ref",
                            "Dev",
                            "Diff",
                            "Pct_diff"]],
                    headers='keys',
                    tablefmt='psql',
                    showindex=False,
                    floatfmt=(".5f", ".5f", ".5f", ".5f", ".5f"),
                ), file=f
            )
            print("\n", file=f)

    # ------------------------------------------
    # Clean up
    # ------------------------------------------
    del df
    del ref_ds
    del dev_ds
    gc.collect()




[docs]
def create_benchmark_summary_table(
        refpath,
        refstr,
        refdate,
        devpath,
        devstr,
        devdate,
        collections,
        dst="./benchmark",
        overwrite=False,
        outfilename="Summary.txt",
        verbose=False,
        ref_gchp=False,
        dev_gchp=False
):
    """
    Creates a benchmark summary table that shows which data collections
    have differences.  Useful for scanning the 1-hr and 1-month benchmark
    outputs.

    Parameters
    ----------
    refpath : str
        Path to the first data set to be compared (aka "Ref").
    refstr : str
        A string that can be used to identify refdata
        (e.g. a model version number or other identifier).
    refdate : np.datetime64
        Date/time stamp used by the "Ref" data files.
    devpath : str
        Path to the second data set to be compared (aka "Dev").
    devstr : str
        A string that can be used to identify the data set specified
        by devfile (e.g. a model version number or other identifier).
    devdate : np.datetime64
        Date/time stamp used by the "Dev" data files.
    collections : list of str
        List of diagnostic collections to examine.
    dst : str, optional
        A string denoting the destination folder where the file
        containing emissions totals will be written.
        Default value: "./benchmark"
    overwrite : bool, optional
        Set this flag to True to overwrite files in the
        destination folder (specified by the dst argument).
        Default value: False
    outfilename : str, optional
        Name of the text file which will contain the table of
        emissions totals.
        Default value: "Summary.txt"
    verbose : bool, optional
        Set this switch to True if you wish to print out extra
        informational messages.
        Default value: False
    ref_gchp : bool, optional
        Set to True if the "Ref" data comes from a GCHP run.
        Default value: False
    dev_gchp : bool, optional
        Set to True if the "Dev" data comes from a GCHP run.
        Default value: False

    Notes
    -----
    This method is mainly intended for model benchmarking purposes,
    rather than as a general-purpose tool.

    Species properties (such as molecular weights) are read from a
    YAML file called "species_database.yml".
    """

    # ==================================================================
    # Open file for output
    # ==================================================================

    # Replace whitespace in the ref and dev labels
    refstr = replace_whitespace(refstr)
    devstr = replace_whitespace(devstr)

    # Create the directory for output
    make_directory(dst, overwrite)

    # Create file
    try:
        f = open(os.path.join(dst, outfilename), "w", encoding=ENCODING)
    except (IOError, OSError, FileNotFoundError) as e:
        msg = f"Could not open {outfilename} for writing!"
        raise e(msg) from e

    # Title strings
    title1 = "### Benchmark summary table"
    title2 = f"### Ref = {refstr}"
    title3 = f"### Dev = {devstr}"

    # Print header to file
    print("#" * 80, file=f)
    print(f"{title1 : <77}{'###'}", file=f)
    print(f"{'###'  : <77}{'###'}", file=f)
    print(f"{title2 : <77}{'###'}", file=f)
    print(f"{title3 : <77}{'###'}", file=f)
    print("#" * 80, file=f)
    print(file=f)

    # ==================================================================
    # Read data and look differences btw Ref & Dev versions
    # ==================================================================

    # Variables to skip
    skip_vars = SKIP_THESE_VARS
    skip_vars.append("AREA")

    # Pick the proper function to read the data
    reader = dataset_reader(
        multi_files=False,
        verbose=verbose
    )

    # Loop over diagnostic files
    for col in collections:

        # Read Ref data
        refdata = reader(
            get_filepath(
                refpath,
                col,
                refdate,
                is_gchp=ref_gchp
            ),
            drop_variables=skip_vars
        )

        # Get Dev data
        devdata = reader(
            get_filepath(
                devpath,
                col,
                devdate,
                is_gchp=dev_gchp
            ),
            drop_variables=skip_vars
        )

        # Make sure that Ref and Dev datasets have the same variables.
        # Variables that are in Ref but not in Dev will be added to Dev
        # with all missing values (NaNs). And vice-versa.
        [refdata, devdata] = add_missing_variables(
            refdata,
            devdata
        )

        # Find all common variables between the two datasets
        vardict = compare_varnames(
            refdata,
            devdata,
            quiet=True
        )

        # List of differences for this collection
        diff_list = []

        # Keep track of which variables are different
        # NOTE: Use 32-point float for comparisons since this is
        # the precision used for History diagnostics.
        for v in vardict["commonvarsData"]:
            if not array_equals(
                    refdata[v],
                    devdata[v],
                    dtype=np.float32
            ):
                diff_list.append(v)

        # Drop duplicate values from diff_list
        diff_list = unique_values(diff_list, drop=[None])

        if len(diff_list) == 0:
            print("-" *  79, file=f)
            print(f"{col}: {devstr} is identical to {refstr}", file=f)
            print(file=f)
        else:
            print("-" *  79, file=f)
            print(f"{col}: {devstr} differs from {refstr}", file=f)
            print("\n  Diagnostics that differ", file=f)
            for i, v in enumerate(diff_list):
                print(f"    {v}", file=f)
                if i > 10:
                    print(f"    ... and {len(diff_list) - 10} others", file=f)
                    break
            print(file=f)

    # ==================================================================
    # Close files
    # ==================================================================
    f.close()




[docs]
def diff_list_to_text(
        refstr,
        devstr,
        diff_list,
        fancy_format=False
):
    """
    Converts a list of species/emissions/inventories/diagnostics that
    show differences between GEOS-Chem versions to a printable text
    string.

    Parameters
    ----------
    refstr : str
        Label for the Ref version.
    devstr : str
        Label for the Dev version.
    diff_list : list
        List to be converted into text.  "None" values will be dropped.
    fancy_format : bool, optional
        Set to True if you wish output text to be bookended with '###'.

    Returns
    -------
    diff_text : str
        String with concatenated list values.
    """
    verify_variable_type(diff_list, list)

    # Use "Dev" and "Ref" for inserting into a header
    if fancy_format:
        refstr = "Ref"
        devstr = "Dev"

    # Strip out duplicates from diff_list
    # Prepare a message about species differences (or alternate msg)
    diff_list = unique_values(diff_list, drop=[None])

    # Print the text
    n_diff = len(diff_list)
    if n_diff > 0:
        diff_text = f"{devstr} and {refstr} show {n_diff} differences"
    else:
        diff_text = f"{devstr} and {refstr} are identical"

    # If we are placing the text in a header, trim the length of diff_text
    # to fit.  NOTE: TABLE_WIDTH-7 leaves room for the '### ' at the start
    # of the string and the '###' at the end of the string,
    if fancy_format:
        diff_text = f"### {diff_text : <{TABLE_WIDTH-7}}{'###'}"
        diff_text = wrap_text(
            diff_text,
            width=TABLE_WIDTH
        )

    return diff_text.strip()




[docs]
def diff_of_diffs_toprow_title(config, model):
    """
    Creates the diff-of-diffs plot title for the top row of the
    six-plot output.  If the title string is too long (as empirically
    determined), then a newline will be inserted in order to prevent
    the title strings from overlapping.

    Parameters
    ----------
    config : dict
        Dictionary containing the benchmark options (as read from a
        YAML file such as 1mo_benchmark.yml, etc.)
    model : str
        The model to plot.  Accepted values are "gcc" or "gchp".

    Returns
    -------
    title : str
        The plot title string for the diff-of-diffs.
    """
    verify_variable_type(config, dict)
    verify_variable_type(model, str)
    if not "gcc" in model and not "gchp" in model:
        msg = "The 'model' argument must be either 'gcc' or 'gchp'!"
        raise ValueError(msg)

    title = (
        config["data"]["dev"][model]["version"]
        + " - "
        + config["data"]["ref"][model]["version"]
    )

    if len(title) > 40:
        title = (
            config["data"]["dev"][model]["version"]
            + " -\n"
            + config["data"]["ref"][model]["version"]
        )

    return title




[docs]
def create_benchmark_sanity_check_table(
        devpath,
        devstr,
        devdate,
        collections,
        dst="./benchmark",
        is_gchp=False,
        overwrite=False,
        outfilename="Diagnostic_Sanity_Check.txt",
        verbose=False,
):
    """
    Creates a diagnostic sanity check table that shows which diagnostic
    variables are zero or NaN everywhere.  This can help to identify
    bugs in diagnostic output.

    Parameters
    ----------
    devpath : str
        Path to the data set to be compared (aka "Dev").
    devstr : str
        A string that can be used to identify the data set specified
        by devfile (e.g. a model version number or other identifier).
    devdate : np.datetime64
        Date/time stamp used by the "Dev" data files.
    collections : list of str
        List of diagnostic collections to examine.
    dst : str, optional
        A string denoting the destination folder where the file
        containing emissions totals will be written.
        Default value: "./benchmark"
    is_gchp : bool, optional
        Set this flag to true to denote if the data is from GCHP.
    overwrite : bool, optional
        Set this flag to True to overwrite files in the
        destination folder (specified by the dst argument).
        Default value: False
    outfilename : str, optional
        Name of the text file which will contain the table of
        emissions totals.
        Default value: "Summary.txt"
    verbose : bool, optional
        Set this switch to True if you wish to print out extra
        informational messages.
        Default value: False

    Notes
    -----
    This method is mainly intended for model benchmarking purposes,
    rather than as a general-purpose tool.
    """

    # ==================================================================
    # Initial preparations
    # ==================================================================

    # Replace whitespace in the ref and dev labels
    devstr = replace_whitespace(devstr)

    # Create the directory for output (if necessary)
    make_directory(dst, overwrite)
    outfilename = os.path.join(dst, outfilename)

    # Pick the proper function to read the data
    reader = dataset_reader(
        multi_files=False,
        verbose=verbose
    )

    # Variables to skip
    skip_vars = SKIP_THESE_VARS
    skip_vars.append("AREA")

    # ==================================================================
    # Open output file and write header
    # ==================================================================
    with open(outfilename, "w", encoding=ENCODING) as ofile:

        # Title strings
        title1 = "### Benchmark diagnostic sanity check table"
        title2 = f"### Dev = {devstr}"

        # Print header to file
        print("#" * 80, file=ofile)
        print(f"{title1 : <77}{'###'}", file=ofile)
        print(f"{'###'  : <77}{'###'}", file=ofile)
        print(f"{title2 : <77}{'###'}", file=ofile)
        print("#" * 80, file=ofile)

        # ==============================================================
        # Loop over diagnostic collections and scan files
        # ==============================================================
        for col in collections:

            # Read data into an xr.DataSet object
            file_name = get_filepath(
                devpath,
                col,
                devdate,
                is_gchp=is_gchp,
            )
            dset = reader(
                file_name,
                drop_variables=skip_vars
            )

            # Determine which variables are all zeroes or NaN
            all_zeros_or_nans = []
            for var in dset.data_vars:
                data = dset[var].values
                if np.all(data == 0) or np.all(data == np.nan):
                    all_zeros_or_nans.append(var)

            # ===========================================================
            # Print results for each collection
            # ===========================================================
            print("", file=ofile)
            print("="*80, file=ofile)
            print(f"{os.path.basename(file_name)}", file=ofile)
            print("="*80, file=ofile)
            print("", file=ofile)

            if len(all_zeros_or_nans) == 0:
                print("No variables were all zero or all NaN", file=ofile)
            else:
                print("These variables were all zero or all NaN:", file=ofile)
                for var in all_zeros_or_nans:
                    print(f"   {var}", file=ofile)