Source code for gcpy.benchmark.modules.benchmark_models_vs_obs

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Python functions to plot modeled data from 1-year fullchem benchmark
simulations against observations for the year 2019.  At present, only
O3 plots are supported, but this can be extended in the future.

References
----------
Author: Matt Rowlinson (York University)

Linted with PyLint and incorporated into GCPy by Bob Yantosca
(GitHub: @yantosca)
"""
import glob
from datetime import datetime, timedelta
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.figure import Figure
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import xarray as xr
from gcpy.constants import SKIP_THESE_VARS
from gcpy.util import \
    dataset_reader, make_directory, replace_whitespace, verify_variable_type
from gcpy.cstools import extract_grid
from gcpy.grid import get_nearest_model_data
from gcpy.benchmark.modules.benchmark_utils import \
    get_geoschem_level_metadata, rename_speciesconc_to_speciesconcvv


[docs] def read_nas( input_file, verbose=False, ): """ Read NASA Ames data files from EBAS (https://ebas-data.nilu.no) Creates data frame of O3 values converted to ppb and dictionary with key site information (name, lat, lon, altitude) Parameters ---------- input_file : str Path to observational data file. verbose : bool, optional Toggles verbose output on/off. Returns ------- obs_dataframe : pd.DataFrame Observations at each station site. obs_site_coords : dict Coords (lon/lat/alt) at each site. """ verify_variable_type(input_file, str) if verbose: print(f"read_nas: Reading {input_file}") # Initialize lon = 0.0 lat = 0.0 alt = 0.0 with open(input_file, encoding='UTF-8') as the_file: header = np.array( [next(the_file) for x in range(155) ] ) n_hdr = int( header[0].split(' ')[0] ) st_ymd = header[6].split(' ') st_ymd = list( filter( None, st_ymd ) ) start_date = datetime( int(st_ymd[0]), int(st_ymd[1]), int(st_ymd[2]) ) for line in header: if 'Station name' in line: site = line.split(':')[1:] site = '_'.join(site).replace('\n','').\ replace(' ',' ').replace('/','-') site = site.replace('Atmospheric Observatory','') site = site.replace(' Research Station','') elif 'Station longitude:' in line: lon = float(line.split(' ')[-1].replace('\n','')) elif 'Station latitude:' in line: lat = float(line.split(' ')[-1].replace('\n','')) elif 'Station altitude:' in line: alt = float(line.split(' ')[-2].replace('\n','')) file_hdr = np.loadtxt( input_file, skiprows=n_hdr ) obs_dataframe = pd.DataFrame( file_hdr, index=file_hdr[:,0] ) obs_dataframe, qcflag = find_times( obs_dataframe, start_date ) obs_dataframe = pd.DataFrame( { 'Value': obs_dataframe.values/1.99532748, 'Flag': qcflag }, index=obs_dataframe.index ) obs_dataframe = obs_dataframe[obs_dataframe.Flag == 0.000] obs_dataframe = obs_dataframe.loc['2019'] obs_dataframe = obs_dataframe.resample('H').mean() obs_dataframe = pd.DataFrame( { site: obs_dataframe.Value }, index=obs_dataframe.index ) obs_site_coords = { site: { 'lon': lon, 'lat': lat, 'alt': alt } } return obs_dataframe, obs_site_coords
[docs] def read_observational_data( path, verbose ): """ Reads the observational O3 data from EBAS (taken from https://ebas-data.nilu.no/ on 15/05/2023) Loops over all data files (in NASA/Ames format) within a folder and concatenates them into a single DataFrame. Parameters ---------- path : str Path to the observational data dir. verbose : bool Toggles verbose output on/off. Returns ------- obs_dataframe : pd.DataFrame Observations at each station site. obs_site_coords : dict Coords (lon/lat/alt) at each site. """ verify_variable_type(path, str) first = True obs_site_coords = {} dataframe = None for infile in sorted(glob.glob(f"{path}/*nas")): obs_dataframe, xyz = read_nas( infile, verbose=verbose ) if first: dataframe = obs_dataframe obs_site_coords.update(xyz) first = False else: dataframe = pd.concat( [dataframe, obs_dataframe], axis=1 ) obs_site_coords.update(xyz) # If dataframe0 is undefined, the loop didn't execute... so throw error if dataframe is None: raise ValueError(f"Could not find data in {path}!") obs_dataframe = dataframe.groupby( dataframe.columns, axis=1 ).max() return obs_dataframe, obs_site_coords
[docs] def read_model_data( filepaths, varname, verbose=False, ): """ Reads model data from a netCDF file. Adds special handling to look for species concentrations variable names starting with either "SpeciesConcVV" or "SpeciesConc". This is necessary for backwards compatitbility with GEOS-Chem output prior to version 14.1.0. Parameters ---------- filepaths : list List of GEOS-Chem data files to read. varname : str Variable name(s) to read. verbose : bool, optional Toggles verbose output on/off. Returns ------- dataarray : xr.DataArray GEOS-Chem data read from disk. """ # Read the Ref and Dev model data reader = dataset_reader( multi_files=True, verbose=verbose, ) # Read data and rename SpeciesConc_ to SpeciesConcVV_, if necessary # (needed for backwards compatibility with older versions.) dataset = reader(filepaths,drop_variables=SKIP_THESE_VARS) dataset = rename_speciesconc_to_speciesconcvv(dataset) # Create a DataArray object and convert to ppbv (if necessary) with xr.set_options(keep_attrs=True): dataarray = dataset[varname] if "mol mol-1" in dataarray.attrs["units"]: dataarray.values *= 1.0e9 dataarray.attrs["units"] = "ppbv" return dataarray
[docs] def find_times( obs_dataframe, start_time ): """ Convert timestamps in nasa ames data files to python datetime objects and set DataFrame index to the new datetime array. Parameters ---------- obs_dataframe : pd.DataFrame Observations from GAW sites. start_time : str Reference start time for obs data. Returns ------- obs_dataframe : pd.DataFrame Observations (ppbv) w/ datetime index. qcflag : pd.DataFrame Quality control info w/ datetime index. start_time : str Reference start time for obs data. """ end_time = obs_dataframe[obs_dataframe.columns[1]] time_x = [] for index in range(len(end_time)): time_x.append(start_time + timedelta(days=end_time.values[index])) obs_dataframe.index = time_x qcflag =obs_dataframe[obs_dataframe.columns[-1]] obs_dataframe = obs_dataframe[obs_dataframe.columns[2]] return obs_dataframe, qcflag
[docs] def get_nearest_model_data_to_obs( gc_data, gc_levels, lon_value, lat_value, alt_value, gc_cs_grid=None ): """ Returns GEOS-Chem model data (on a cubed-sphere grid) at the grid box closest to an observation site location. Parameters ---------- gc_data : pd.DataFrame Observations at each station site. gc_levels : pd.DataFrame Metadata for model vertical levels. lon_value : float Longitude of station site. lat_value : float Latitude of station site. alt_value : float Altitude of station site. gc_cs_grid : xr.Dataset, optional Metadata for Dev cubed-sphere grid. Returns ------- dataframe : pd.DataFrame Model data closest to the station site. """ verify_variable_type(gc_data, xr.DataArray) verify_variable_type(gc_cs_grid, (xr.Dataset, type(None))) verify_variable_type(gc_levels, pd.DataFrame) # Prevent the latitude from getting too close to the N or S poles lat_value = max(min(lat_value, 89.75), -89.75) # Nearest GEOS-Chem data to (lat, lon) of observation dframe = get_nearest_model_data( gc_data, lon_value, lat_value, gc_cs_grid=gc_cs_grid ) dframe = dframe.reset_index() # Nearest GEOS-Chem level to observation gc_alts = gc_levels["Altitude (m)"].values n_alts = len(gc_alts) z_idx =(np.abs(gc_alts - float(alt_value))).argmin() # Pick out elements of the dataframe at the nearest level to obs rows = np.zeros(len(dframe), dtype=bool) good = [(v * n_alts) + z_idx for v in range(12)] rows[good] = True return dframe[rows].set_index("time")
[docs] def prepare_data_for_plot( obs_dataframe, obs_site_coords, obs_site_name, ref_dataarray, ref_cs_grid, dev_dataarray, dev_cs_grid, gc_levels, varname="SpeciesConcVV_O3", ): """ Prepares data for passing to routine plot_single_frames as follows: (1) Computes the mean of observations at the given station site. (2) Returns the GEOS-Chem Ref and Dev data at the gridbox closest to the given station site. (3) Creates the top-of-plot title for the given station site. Parameters ---------- obs_dataframe : pd.DataFrame Observations at each station site. obs_site_coords : dict Coords (lon/lat/alt) at each site. obs_site_name : str Names of station sites. ref_dataarray : xr.DataArray Data from the Ref model version. ref_label : str Label for the Ref model data. ref_cs_grid : xr.Dataset Metadata for Ref cubed-sphere grid. dev_dataarray : xr.DataArray Data from the Dev model version. dev_label : str Label for the Dev model data. dev_cs_grid : xr.Dataset Metadata for Dev cubed-sphere grid. gc_levels : pd.DataFrame Metadata for model vertical levels. varname : str, optional Variable name for model data. Returns ------- obs_dataframe : pd.DataFrame Mean observational data @ station site. ref_series : pd.Series Ref data nearest to the station site. dev_series : pd.Series Dev data nearest to the station site. subplot_title : str Title for the station site subplot. subplot_ylabel : str Y-axis title for the station site subplot. """ verify_variable_type(obs_dataframe, pd.DataFrame) verify_variable_type(obs_site_coords, dict) verify_variable_type(obs_site_name, str) verify_variable_type(ref_dataarray, xr.DataArray) verify_variable_type(dev_dataarray, xr.DataArray) verify_variable_type(ref_cs_grid, (xr.Dataset, type(None))) verify_variable_type(dev_cs_grid, (xr.Dataset, type(None))) verify_variable_type(gc_levels, pd.DataFrame) verify_variable_type(varname, str) # Get data from the Ref model closest to the data site coords = [ round(obs_site_coords[obs_site_name]['lon'], 2), round(obs_site_coords[obs_site_name]['lat'], 2), round(obs_site_coords[obs_site_name]['alt'], 1) ] # Get data from the Ref model closest to the obs site ref_dataframe = get_nearest_model_data_to_obs( ref_dataarray, gc_levels, coords[0], # Obs site lon coords[1], # Obs site lat coords[2], # Obs site alt gc_cs_grid=ref_cs_grid ) # Get data from the Dev model closest to the obs site dev_dataframe = get_nearest_model_data_to_obs( dev_dataarray, gc_levels, coords[0], # Obs site lon coords[1], # Obs site lat coords[2], # Obs site alt gc_cs_grid=dev_cs_grid, ) # Take the monthly mean of observations for plotting # (since some observation sites have multiple months of data) obs_dataframe = obs_dataframe.resample('M').mean() # Create the top title for the subplot for this observation site # (use integer lon & lat values and N/S lat and E/W lon notation) lon = int(round(obs_site_coords[obs_site_name]['lon'], 0)) lat = int(round(obs_site_coords[obs_site_name]['lat'], 0)) ystr = "S" if lat >= 0: ystr = "N" xstr = "W" if lon >= 0: xstr = "E" lon = abs(lon) lat = abs(lat) subplot_title = \ f"{obs_site_name.strip()} ({lat}$^\\circ${ystr},{lon}$^\\circ${xstr})" # Y-axis label (i.e. species name) subplot_ylabel = varname.split("_")[1] + " (ppbv)" return obs_dataframe, ref_dataframe[varname], dev_dataframe[varname], \ subplot_title, subplot_ylabel
[docs] def plot_single_station( fig, rows_per_page, cols_per_page, subplot_index, subplot_title, subplot_ylabel, obs_dataframe, obs_label, obs_site_name, ref_series, ref_label, dev_series, dev_label ): """ Plots observation data vs. model data at a single station site. Parameters ---------- fig : mpl.figure.Figure Figure object for the plot. rows_per_page : int # of rows to plot on a page. cols_per_page : int # of columns to plot on a page. subplot_index : int Index of each subplot. subplot_title : str Title for each subplot. subplot_ylabel : str Y-axis title for each subplot. obs_dataframe : pd.DataFrame Observations at each station site. obs_site_name : str Name of the station site. ref_series : pd.Series Data from the Ref model version. ref_label : str Label for the Ref model data. dev_dataarray : pd.Series Data from the Dev model version. dev_label : str Label for the Dev model data. """ verify_variable_type(fig, Figure) verify_variable_type(rows_per_page, int) verify_variable_type(cols_per_page, int) verify_variable_type(subplot_index, int) verify_variable_type(subplot_title, str) verify_variable_type(subplot_ylabel, str) verify_variable_type(obs_dataframe, pd.DataFrame) verify_variable_type(ref_series, pd.Series) verify_variable_type(ref_label, str) verify_variable_type(dev_series, pd.Series) verify_variable_type(dev_label, str) # Create matplotlib axes object for this subplot # axes_subplot is of type matplotlib.axes_.subplots.AxesSubplot axes_subplot = fig.add_subplot( rows_per_page, cols_per_page, subplot_index + 1, ) # Set title for top of each frame axes_subplot.set_title( f"{subplot_title}", weight='bold', fontsize=7 ) ## Plot observational data axes_subplot.plot( obs_dataframe.index, obs_dataframe[obs_site_name], color='k', marker='^', markersize=4, lw=1, label=f"{obs_label}" ) # Plot model data axes_subplot.plot( obs_dataframe.index, ref_series, color='r', marker='o', markersize=3, lw=1, label=ref_label ) axes_subplot.plot( obs_dataframe.index, dev_series, color='g', marker='s', markersize=3, lw=1, label=dev_label ) # Apply y-axis label only if this is a leftmost plot panel if subplot_index == 0 or subplot_index % cols_per_page == 0: axes_subplot.set_ylabel( subplot_ylabel, fontsize=8 ) # Set X-axis and Y-axis ticks and labels axes_subplot.set_xticks( obs_dataframe.index ) # NOTE: In newer versions of matplotlib you can pass the # xticklabels keyword to the set_xticks function. But we need # to set the xticklabels separately for backwards compatibility # with older matplotlib versions. -- Bob Yantosca (06 Jul 2023) axes_subplot.set_xticklabels( ['J','F','M','A','M','J','J','A','S','O','N','D'] ) axes_subplot.set_ylim( 0, 80 ) axes_subplot.set_yticks( [0, 20, 40, 60, 80] ) axes_subplot.tick_params( axis='both', which='major', labelsize=6 )
[docs] def plot_one_page( pdf, obs_dataframe, obs_label, obs_site_coords, obs_site_names, ref_dataarray, ref_label, ref_cs_grid, dev_dataarray, dev_label, dev_cs_grid, gc_levels, rows_per_page=3, cols_per_page=3, varname="SpeciesConcVV_O3", ): """ Plots a single page of models vs. observations. Parameters ---------- obs_dataframe : pd.DataFrame Observations at each station site. obs_label : str Label for the observational data. obs_site_coords : dict Coords (lon/lat/alt) at each site. obs_site_names : list Names of station sites per page. ref_dataarray : xr.DataArray Data from the Ref model version. ref_label : str Label for the Ref model data. ref_cs_grid : str or None Metadata for Ref cubed-sphere grid. dev_dataarray : xr.DataArray Data from the Dev model version. dev_label : str Label for the Dev model data. dev_cs_grid : str or None Metadata for Dev cubed-sphere grid. gc_levels : pd.DataFrame Metadata for model vertical levels. rows_per_page : int, optional Number of rows to plot on a page. varname : str, optional Variable name for model data. """ verify_variable_type(obs_dataframe, pd.DataFrame) verify_variable_type(obs_site_coords, dict) verify_variable_type(obs_site_names, list) verify_variable_type(ref_dataarray, xr.DataArray) verify_variable_type(ref_label, str) verify_variable_type(ref_cs_grid, (xr.Dataset, type(None))) verify_variable_type(dev_dataarray, xr.DataArray) verify_variable_type(dev_label, str) verify_variable_type(dev_cs_grid, (xr.Dataset, type(None))) verify_variable_type(gc_levels, pd.DataFrame) # Define a new matplotlib.figure.Figure object for this page # Landscape width: 11" x 8" fig = plt.figure(figsize=(11, 8)) fig.tight_layout() # Loop over all of the stations that fit on the page for subplot_index, obs_site_name in enumerate(obs_site_names): # Find the model Ref & Dev data closest to the observational # station site. Also take monthly average of observations, obs_dataframe, \ ref_series, dev_series, \ subplot_title, subplot_ylabel \ = prepare_data_for_plot( obs_dataframe, # pandas.DataFrame obs_site_coords, # dict obs_site_name, # str ref_dataarray, # xarray.DataArray ref_cs_grid, # dict or none dev_dataarray, # xarray.DataArray dev_cs_grid, # dict or none gc_levels, # pandas.DataFrame varname=varname # str ) # Plot models vs. observation for a single station site plot_single_station( fig, # matplotlib.figure.Figure rows_per_page, # int cols_per_page, # int subplot_index, # int subplot_title, # str subplot_ylabel, # str obs_dataframe, # pandas.Dataframe obs_label, # str obs_site_name, # str ref_series, # pandas.Series ref_label, # str dev_series, # pandas.Series dev_label, # str ) # Add extra spacing around plots plt.subplots_adjust( hspace=0.4, top=0.9 ) # Add top-of-page legend plt.legend( ncol=3, bbox_to_anchor=(0.5, 0.98), bbox_transform=fig.transFigure, loc='upper center' ) # Save this page to the PDF file pdf.savefig(fig)
[docs] def plot_models_vs_obs( obs_dataframe, obs_label, obs_site_coords, ref_dataarray, ref_label, dev_dataarray, dev_label, gc_levels, varname="SpeciesConcVV_O3", dst="./benchmark", ): """ Plots models vs. observations using a 3 rows x 3 column layout. Parameters ---------- obs_dataframe : pd.DataFrame Observations at each station site. obs_label : str Label for the observational data. obs_site_coords : dict Coords (lon/lat/alt) at each site. ref_dataarray : xr.DataArray Data from the Ref model version. ref_label : str Label for the Ref model data. dev_dataarray : xr.DataArray Data from the Dev model version. dev_label : str Label for the Dev model data. gc_levels : pd.DataFrame Metadata for model vertical levels. varname : str, optional Variable name for model data. dst : str, optional Destination folder for plots. """ verify_variable_type(obs_dataframe, pd.DataFrame) verify_variable_type(obs_site_coords, dict) verify_variable_type(ref_dataarray, xr.DataArray) verify_variable_type(ref_label, str) verify_variable_type(dev_dataarray, xr.DataArray) verify_variable_type(dev_label, str) verify_variable_type(gc_levels, pd.DataFrame) # Get the cubed-sphere grid definitions for Ref & Dev # (will be returned as "None" for lat/lon grids) ref_cs_grid = extract_grid(ref_dataarray) dev_cs_grid = extract_grid(dev_dataarray) # Figure setup plt.style.use("seaborn-v0_8-darkgrid") rows_per_page = 3 cols_per_page = 3 plots_per_page = rows_per_page * cols_per_page # Open the plot as a PDF document pdf_file = f"{dst}/models_vs_obs.surface.{varname.split('_')[1]}.pdf" pdf = PdfPages(pdf_file) # Sort station sites N to S latitude order according to: # https://www.geeksforgeeks.org/python-sort-nested-dictionary-by-key/ # NOTE: obs_site_names will be a MultiIndex list (a list of tuples) obs_site_names = sorted( obs_site_coords.items(), key = lambda x: x[1]['lat'], reverse=True ) # Convert obs_site_names from a MultiIndex list to a regular list obs_site_names = [list(tpl)[0] for tpl in obs_site_names] # Loop over the number of obs sites that fit on a page for start in range(0, len(obs_site_names), plots_per_page): end = start + plots_per_page - 1 # Plot obs sites that fit on a single page plot_one_page( pdf, # PdfPages obs_dataframe, # pandas.DataFrame obs_label, # str obs_site_coords, # dict obs_site_names[start:end+1], # list of str ref_dataarray, # xarray.DataArray ref_label, # str ref_cs_grid, # xarray.DataSet or NoneType dev_dataarray, # xarray.DataArray dev_label, # str dev_cs_grid, # xarray.Dataset or NoneType gc_levels, # pandas.DataFrame rows_per_page=rows_per_page, # int cols_per_page=cols_per_page, # int varname=varname, # str ) # Close the PDF file after all pages are plotted. pdf.close() # Reset the plot style (this prevents the seaborn style from # being applied to other model vs. obs plotting scripts) plt.style.use("default")
[docs] def make_benchmark_models_vs_obs_plots( obs_filepaths, obs_label, ref_filepaths, ref_label, dev_filepaths, dev_label, varname="SpeciesConcVV_O3", dst="./benchmark", verbose=False, overwrite=False ): """ Driver routine to create model vs. observation plots. Parameters ---------- obs_filepaths : str or list Path(s) to the observational data. obs_label : str Label for the observational data. ref_filepaths : str Paths to the Ref model data. ref_label : str Label for the Ref model data. dev_filepaths : str Paths to the Dev model data. dev_label : str Label for the Dev model data. varname : str, optional Variable name for model data. dst : str, optional Destination folder for plots. verbose : bool, optional Toggles verbose output on/off. overwrite : bool, optional Toggles overwriting contents of dst. """ verify_variable_type(obs_filepaths, (str, list)) verify_variable_type(ref_filepaths, (str, list)) verify_variable_type(ref_label, str) verify_variable_type(dev_filepaths, (str, list)) verify_variable_type(dev_label, str) # Replace whitespace in the ref and dev labels ref_label = replace_whitespace(ref_label) dev_label = replace_whitespace(dev_label) # Create the destination folder make_directory( dst, overwrite=overwrite ) # Get GEOS-Chem level metadata gc_levels = get_geoschem_level_metadata( search_key=["Altitude (km)", "Eta Mid"] ) gc_levels["Altitude (m)"] = gc_levels["Altitude (km)"] * 1000.0 # Read the observational data obs_dataframe, obs_site_coords = read_observational_data( obs_filepaths, verbose=verbose ) # Read the model data ref_dataarray = read_model_data( ref_filepaths, varname=varname ) dev_dataarray = read_model_data( dev_filepaths, varname=varname ) # Plot data vs observations plot_models_vs_obs( obs_dataframe, # pandas.DataFrame obs_label, # str obs_site_coords, # dict ref_dataarray, # xarray.DataArray ref_label, # str dev_dataarray, # xarray.DataArray dev_label, # str gc_levels, # pandas.DataFrame varname=varname, # str dst=dst, # str )