Source code for gcpy.kpp.kppsa_utils

#!/usr/bin/env python3
"""
Utility functions for visualizing output from the
KPP-Standalone box model.
"""

# Imports
from os.path import expanduser, join
from glob import glob
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gcpy.constants import ENCODING
from gcpy.util import get_element_of_series, verify_variable_type



[docs]
def kppsa_get_file_list(input_dir, pattern=""):
    """
    Returns a list of KPP-Standalone log files matching a search
    pattern.

    Parameters
    ----------
    input_dir : str
        Directory containing KPP-Standalone log files.
    pattern : str, optional
        Glob pattern used to filter filenames. All files containing
        this string will be returned.
        Default: ``""`` (matches all files)

    Returns
    -------
    file_list : list of str
        List of file paths matching the search criteria.
    """
    return glob(join(expanduser(input_dir), f"*{pattern}*"))




[docs]
def kppsa_read_one_csv_file(file_name):
    """
    Reads a single KPP-Standalone log file in CSV format into a
    pandas DataFrame.

    Metadata fields (location, timestamp, longitude, latitude,
    vertical level, and pressure) are extracted from the file
    header and appended as additional columns.

    Parameters
    ----------
    file_name : str
        Path to the CSV-format KPP-Standalone log file to be read.

    Returns
    -------
    dframe : pandas.DataFrame
        DataFrame containing species concentrations and associated
        metadata extracted from the file header. Added columns are:
        ``Location``, ``DateTime``, ``Longitude``, ``Latitude``,
        ``Level``, and ``Pressure``.
    """
    verify_variable_type(file_name, str)

    # Initialize variables
    latitude  = ""
    level     = ""
    longitude = ""
    location  = ""
    pressure  = ""
    timestamp = ""

    # Read file header contents
    with open(file_name, "r", encoding=ENCODING) as ifile:

        # Find the number of rows to skip
        skiprows = int(ifile.readline().strip()) + 1

        # Read the rest of the header contents
        for line in ifile:
            line = line.strip()

            # Extract selected metadata fields
            if "Location" in line:
                location = line.split(":")[1].strip()
            if "Timestamp" in line:
                substrs = line.split(":")
                timestamp = substrs[1].strip() + ":" + substrs[2].strip()
                timestamp = timestamp.replace("/", "-").replace(" ", "T")
            if "Longitude" in line:
                longitude = line.split(":")[1].strip()
            if "Latitude" in line:
                latitude = line.split(":")[1].strip()
            if "GEOS-Chem Vertical Level" in line:
                level = line.split(":")[1].strip()
            if "Pressure (hPa)" in line:
                pressure = line.split(":")[1].strip()

            # Exit the loop once the column header row is reached
            if "Species Name" in line:
                break

    # Read the CSV into a DataFrame object
    dframe = pd.read_csv(
        file_name,
        skiprows=skiprows,
        delimiter=",",
        dtype={
            "Initial Concentration (molec/cm3)": np.float64,
            "Final Concentration (molec/cm3)": np.float64,
        },
        engine="c"
    )

    # Add series with metadata obtained from the header
    dframe = dframe.assign(Location=location)
    dframe = dframe.assign(DateTime=np.datetime64(timestamp))
    dframe = dframe.assign(Longitude=float(longitude))
    dframe = dframe.assign(Latitude=float(latitude))
    dframe = dframe.assign(Level=int(level))
    dframe = dframe.assign(Pressure=float(pressure))

    return dframe




[docs]
def kppsa_read_csv_files(file_list):
    """
    Reads multiple KPP-Standalone log files and concatenates them
    into a single pandas DataFrame.

    Parameters
    ----------
    file_list : list of str
        Paths to KPP-Standalone CSV log files to be read.

    Returns
    -------
    dframe_all : pandas.DataFrame
        Concatenated DataFrame containing data from all files in
        ``file_list``.
    """
    dframe_all = pd.DataFrame()
    for file_name in file_list:
        dframe = kppsa_read_one_csv_file(file_name)
        dframe_all = pd.concat([dframe_all, dframe], ignore_index=True)
        del dframe

    return dframe_all




[docs]
def kppsa_prepare_site_data(dframe, site_name, species):
    """
    Returns a DataFrame and plot title for a given species and
    observation site.

    Data is filtered to include only observations at or below
    500 hPa (surface to lower stratosphere) and sorted by
    descending pressure.

    Parameters
    ----------
    dframe : pandas.DataFrame
        KPP-Standalone output data as returned by
        :func:`kppsa_read_csv_files`.
    site_name : str
        Name of the observation site to extract data for.
    species : str
        Name of the species to extract data for.

    Returns
    -------
    site_data : pandas.DataFrame or None
        DataFrame filtered to the given site and species, sorted by
        descending pressure, and limited to pressures >= 500 hPa.
        Returns ``None`` if ``dframe`` is ``None``.
    site_title : str or None
        Plot title string including the site name, cardinal
        coordinates, and timestamp (e.g.
        ``"Mace Head (53°N,10°W) at 2019-07-01T00:00"``).
        Returns ``None`` if ``dframe`` is ``None``.
    """
    # Exit if the data frame is set to None
    if dframe is None:
        return None, None

    # Get data for a given species at all locations
    site_data = dframe.loc[dframe["Location"] == site_name]
    site_data = site_data.loc[site_data["Species Name"] == species]
    site_data = site_data.loc[site_data["Pressure"] >= 500]
    site_data = site_data.sort_values(by="Pressure", ascending=False)

    # Create the top title for the subplot for this observation site
    # (use integer lon & lat values and N/S lat and E/W lon notation)
    lat  = int(round(get_element_of_series(site_data["Latitude"],  0)))
    lon  = int(round(get_element_of_series(site_data["Longitude"], 0)))
    time = get_element_of_series(site_data["DateTime"], 0)
    ystr = "S" if lat < 0 else "N"
    xstr = "W" if lon < 0 else "E"
    lon  = abs(lon)
    lat  = abs(lat)
    site_title = \
        f"{site_name.strip()} ({lat}$^\\circ${ystr},{lon}$^\\circ${xstr})"
    site_title += f" at {time}"

    return site_data, site_title




[docs]
def kppsa_plot_single_site(
        fig,
        rows_per_page,
        cols_per_page,
        subplot_index,
        subplot_title,
        ref_data,
        ref_label,
        dev_data,
        dev_label,
        species,
        font_scale,
):
    """
    Plots initial and final species concentrations as vertical
    profiles at a single observation site.

    Parameters
    ----------
    fig : matplotlib.figure.Figure
        Figure object into which the subplot will be added.
    rows_per_page : int
        Number of subplot rows on the page.
    cols_per_page : int
        Number of subplot columns on the page.
    subplot_index : int
        Zero-based index of this subplot within the page grid.
    subplot_title : str
        Title string displayed above this subplot.
    ref_data : pandas.DataFrame
        KPP-Standalone output for the ``Ref`` model version at this
        site, as returned by :func:`kppsa_prepare_site_data`.
    ref_label : str
        Legend label for the ``Ref`` model final concentration.
    dev_data : pandas.DataFrame or None
        KPP-Standalone output for the ``Dev`` model version at this
        site. Pass ``None`` to omit the Dev profile.
    dev_label : str or None
        Legend label for the ``Dev`` model final concentration.
        Ignored if ``dev_data`` is ``None``.
    species : str
        Name of the species being plotted, used for axis labels
        and error checking.
    font_scale : float
        Multiplicative scale factor applied to all font sizes,
        allowing text to be enlarged or reduced uniformly.

    Raises
    ------
    ValueError
        If ``species`` is not found in ``ref_data``.
    ValueError
        If ``dev_data`` is not ``None`` and ``species`` is not
        found in ``dev_data``.

    Notes
    -----
    The Y-axis (pressure) is plotted from 1020 hPa at the bottom
    to 500 hPa at the top. The Y-axis label is only applied to
    leftmost panels (``subplot_index`` is 0 or a multiple of
    ``cols_per_page``).
    """
    # Error checks
    if species not in list(ref_data["Species Name"]):
        raise ValueError("Species not found in Ref model output!")
    if dev_data is not None:
        if species not in list(dev_data["Species Name"]):
            raise ValueError("Species not found in Dev model output!")

    # Create matplotlib axes object for this subplot
    axes_subplot = fig.add_subplot(
        rows_per_page,
        cols_per_page,
        subplot_index + 1,
    )

    # Title for each subplot
    axes_subplot.set_title(
        subplot_title,
        weight='bold',
        fontsize=8*font_scale,
    )

    # Initial concentration
    axes_subplot.plot(
        ref_data["Initial Concentration (molec/cm3)"].astype(np.float64),
        ref_data["Pressure"],
        color='k',
        marker='o',
        markersize=3*font_scale,
        lw=1,
        label=f"Initial {species}",
    )

    # Final concentration (Ref)
    axes_subplot.plot(
        ref_data["Final Concentration (molec/cm3)"].astype(np.float64),
        ref_data["Pressure"],
        color='r',
        marker='o',
        markersize=3*font_scale,
        lw=1,
        label=f"{ref_label}",
    )

    # Final concentration (Dev)
    if dev_data is not None:
        axes_subplot.plot(
            dev_data["Final Concentration (molec/cm3)"].astype(np.float64),
            dev_data["Pressure"],
            color='b',
            marker='o',
            markersize=3*font_scale,
            lw=1,
            label=f"{dev_label}",
        )

    # Set X and Y axis labels
    axes_subplot.set_xlabel(
        f"{species} (molec/cm3)",
        fontsize=8*font_scale,
    )

    # Apply y-axis label only if this is a leftmost plot panel
    if subplot_index == 0 or subplot_index % cols_per_page == 0:
        axes_subplot.set_ylabel(
            "Pressure (hPa)",
            fontsize=8*font_scale,
        )

    # Set Y-axis range and ticks
    axes_subplot.set_ylim(1020.0, 500.0)
    axes_subplot.set_yticks([1000, 900, 800, 700, 600, 500])
    axes_subplot.tick_params(
        axis='both',
        which='major',
        labelsize=8*font_scale
    )




[docs]
def kppsa_plot_one_page(
        pdf,
        site_names,
        ref_dframe,
        ref_label,
        dev_dframe,
        dev_label,
        species="O3",
        rows_per_page=3,
        cols_per_page=3,
        font_scale=1.0,
):
    """
    Plots a single page of Ref vs. Dev vertical profiles for a
    list of observation sites and saves it to a PDF file.

    Parameters
    ----------
    pdf : matplotlib.backends.backend_pdf.PdfPages
        Open PDF file object to which the page will be saved.
    site_names : list of str
        Names of the observation sites to plot on this page.
    ref_dframe : pandas.DataFrame
        KPP-Standalone output for the ``Ref`` model version, as
        returned by :func:`kppsa_read_csv_files`.
    ref_label : str
        Legend label for the ``Ref`` model data.
    dev_dframe : pandas.DataFrame
        KPP-Standalone output for the ``Dev`` model version, as
        returned by :func:`kppsa_read_csv_files`.
    dev_label : str
        Legend label for the ``Dev`` model data.
    species : str, optional
        Name of the species to plot.
        Default: ``"O3"``
    rows_per_page : int, optional
        Number of subplot rows on the page.
        Default: ``3``
    cols_per_page : int, optional
        Number of subplot columns on the page.
        Default: ``3``
    font_scale : float, optional
        Multiplicative scale factor applied to all font sizes.
        Default: ``1.0``

    Notes
    -----
    The page is landscape-oriented (11" × 8"). A shared legend is
    placed at the top of the page. Vertical spacing between subplots
    is adjusted automatically.
    """
    # Define a new matplotlib.figure.Figure object for this page
    # Landscape width: 11" x 8"
    fig = plt.figure(figsize=(11, 8))
    fig.tight_layout()

    # Loop over all of the stations that fit on the page
    for subplot_index, site_name in enumerate(site_names):

        # Get the data for the given species and site
        ref_site_data, site_title = kppsa_prepare_site_data(
            ref_dframe,
            site_name,
            species,
        )
        dev_site_data, _ = kppsa_prepare_site_data(
            dev_dframe,
            site_name,
            species,
        )

        # Plot species vertical profile at a given site
        kppsa_plot_single_site(
            fig,
            rows_per_page,
            cols_per_page,
            subplot_index,
            site_title,
            ref_site_data,
            ref_label,
            dev_site_data,
            dev_label,
            species,
            font_scale,
        )

    # Add extra spacing around plots
    plt.subplots_adjust(hspace=0.4, top=0.9)

    # Add top-of-page legend
    plt.legend(
        ncol=3,
        bbox_to_anchor=(0.5, 0.98),
        bbox_transform=fig.transFigure,
        loc='upper center'
    )

    # Save this page to the PDF file
    pdf.savefig(fig)




[docs]
def kppsa_get_unique_site_names(dframe):
    """
    Returns a list of unique observation site names from
    KPP-Standalone output, ordered from north to south.

    Parameters
    ----------
    dframe : pandas.DataFrame
        KPP-Standalone output data as returned by
        :func:`kppsa_read_csv_files`.

    Returns
    -------
    unique_site_names : list of str
        Unique site names sorted by descending latitude
        (north to south).
    """
    # Sort the sites from north to south
    site_names = dframe[["Location", "Latitude"]].sort_values(
        by="Latitude",
        ascending=False
    )

    # Get a list of unique site names, preserving the ordering
    unique_site_names = []
    seen = set()
    for site_name in site_names["Location"]:
        if site_name not in seen:
            seen.add(site_name)
            unique_site_names.append(site_name)

    return unique_site_names