Source code for gcpy.benchmark.modules.benchmark_scrape_gchp_timers

#!/usr/bin/env python3
"""
Scrapes GCHP benchmark timing information from one or
more text files.
"""
import os
import subprocess
import numpy as np
from gcpy.constants import ENCODING
from gcpy.util import make_directory, replace_whitespace, verify_variable_type



[docs]
def read_timing_data(input_files):
    """
    Reads GCHP timing information from one or more text files.

    Every file is parsed with ``read_one_text_file`` and the results
    are returned in the same order as the inputs.

    Parameters
    ----------
    input_files : str or list
        Text file(s) to parse.

    Returns
    -------
    timing : list of dict
        One dictionary of timing information per input file.  A
        single file name still produces a (one-element) list.

    Raises
    ------
    ValueError
        If ``input_files`` is neither a str nor a list.
    """
    # Several files: one dictionary per file, in input order
    if isinstance(input_files, list):
        return [read_one_text_file(path) for path in input_files]

    # A lone file name: wrap the result so a list is always returned
    if isinstance(input_files, str):
        return [read_one_text_file(input_files)]

    raise ValueError("Argument 'input_files' is not of type str or list!")




[docs]
def count_characters(text, char_to_match="."):
    """
    Returns how many times a single character occurs in a string.

    Parameters
    ----------
    text : str
        The text to parse.
    char_to_match : str, optional
        The (single) character to look for in ``text``.

    Returns
    -------
    result : int
        Number of occurrences of ``char_to_match`` in ``text``.
        Zero is returned when ``char_to_match`` is not exactly one
        character long (e.g. "" or "--").
    """
    # Matching has always been done one character at a time, so an
    # empty or multi-character pattern can never match anything.
    if not isinstance(char_to_match, str) or len(char_to_match) != 1:
        return 0

    return text.count(char_to_match)




[docs]
def check_file_for_timing_info(text_file):
    """
    Checks if a given text file contains GCHP timers output.  If not,
    the file name is reset to allPEs.log (in the same path).

    This update is necessary because MAPL v2.59 has switched the GCHP
    timers printout from the GCHP log file to allPEs.log.  This will
    allow backwards compatibility with output from GCHP simulations
    that use older MAPL versions.

    The file is scanned directly in Python, so no external ``grep``
    executable is required.

    Parameters
    ----------
    text_file : str
        Name of the text file to check.

    Returns
    -------
    text_file : str
        Name of the text file containing GCHP timers output
        (either the input file or "allPEs.log" in the same folder).
        The fallback name is also returned when ``text_file`` cannot
        be opened; the existence of "allPEs.log" is not checked here.
    """
    marker = b"Times for component <GCHPchem>"

    # Scan in binary mode: the marker is plain ASCII, so the search
    # does not depend on (or fail because of) the file's encoding.
    try:
        with open(text_file, "rb") as ifile:
            # any() stops reading at the first line holding the marker
            if any(marker in line for line in ifile):
                return text_file
    except OSError:
        # A missing or unreadable file is treated like a file without
        # timers, which is what the former grep-based check did.
        pass

    return os.path.join(os.path.dirname(text_file), "allPEs.log")




[docs]
def read_one_text_file(text_file):
    """
    Extracts timer values from a single plain-text GCHP log file.

    Two parts of the log are scanned: the table that follows
    "Times for component <GCHPchem>" and the summary that follows
    "Report on process:".  Timer labels in the log carry a "--"
    prefix per nesting level; these are turned into dot-separated
    keys (e.g. "--B" listed under "A" is stored as "A.B").

    Parameters
    ----------
    text_file : str
        Text file with timing information.  If it does not contain
        the GCHPchem timers table, the file chosen by
        ``check_file_for_timing_info`` is read instead.

    Returns
    -------
    timers : dict
        Flat dictionary mapping dot-separated timer names to
        timer values (float).

    Raises
    ------
    FileNotFoundError
        If ``text_file`` does not exist.
    """

    # Make sure file exists
    if not os.path.exists(text_file):
        raise FileNotFoundError(f"Could not find {text_file}!")

    # ==================================================================
    # Parse the GCHP log file
    # ==================================================================

    # Log-line prefixes that are only present in "allPEs.log"
    log_prefixes = (
        "0000: GCHPctmEnv: INFO:",
        "0000: MAPL.profiler: INFO:",
    )

    # Header rows of the GCHPchem timers table (skipped)
    gchpchem_headers = (
        'Min                            Mean',
        '============================',
        'Name                          %',
        '------ ---------- ----------',
    )

    # Header rows of the summary table (skipped)
    summary_headers = (
        'Inclusive',
        '================',
        'Name',
        '-------- --------- ------ --------- ------',
    )

    in_section = False    # True while inside one of the two tables
    value_column = 0      # Index of the whitespace-separated column to read
    entries = []          # (label, value) pairs in file order

    # Check if the input file has GCHP timers, otherwise use "allPEs.log"
    text_file = check_file_for_timing_info(text_file)

    with open(text_file, encoding=ENCODING) as ifile:
        for raw_line in ifile:

            # Drop the "allPEs.log" prefixes, then surrounding whitespace
            line = raw_line
            for prefix in log_prefixes:
                line = line.replace(prefix, "")
            line = line.strip()

            # Nothing left on this line
            if not line:
                continue

            # Start of the GCHPchem timers table
            if 'Times for component <GCHPchem>' in line:
                in_section = True
                value_column = 3
                continue

            if in_section:
                # Table header rows carry no timer data
                if any(marker in line for marker in gchpchem_headers):
                    continue
                # A long dashed rule closes the table
                if '---------------------------------' in line:
                    in_section = False
                    continue

            # Skip everything that follows GCHPchem until the
            # summary timers section (no "continue": the checks
            # below must still see this line)
            if "Times for component <DYNAMICS>" in line:
                in_section = False

            # Start of the summary table
            if 'Report on process:' in line:
                in_section = True
                value_column = 2
                continue

            # Everything below only applies inside a table
            if not in_section:
                continue

            # Summary header rows carry no timer data
            if any(marker in line for marker in summary_headers):
                continue

            # This line appears in GCHP log files using MAPL
            # versions prior to 2.59.  It indicates the end of the
            # GCHP timers section.  Exit when we encounter this.
            if "++" in line:
                break

            # A timer row: first field is the label, the value sits
            # in the column selected for the current table
            fields = line.split()
            entries.append((fields[0], float(fields[value_column])))

    # ==================================================================
    # Save timing results into a "flattened" dictionary
    # ==================================================================

    # Most recent label seen at each of the five tracked depths
    ancestors = ["", "", "", "", ""]
    timers = {}
    for label, value in entries:

        # Two "-" characters denote one nesting level.
        # NOTE(review): every "-" in the label is counted, not only the
        # leading ones -- presumably timer names never contain "-"; confirm.
        depth, leftover = divmod(count_characters(label, "-"), 2)

        # Remove the "-" characters at either end of the label
        name = label.strip("-")

        if leftover == 0 and depth < len(ancestors):
            # Tracked depth: remember this label for its children
            # and prefix it with the labels of its ancestors
            ancestors[depth] = name
            path = ancestors[:depth] + [name]
        else:
            # Anything else (deeper than tracked, or an odd number
            # of "-") is filed under all five remembered labels
            path = ancestors + [name]

        timers[".".join(path)] = value

    return timers




[docs]
def sum_timers(timers):
    """
    Sums the time in seconds for each GEOS-Chem timer.  Input may be
    a single dict with timing information or a list of dicts.

    Parameters
    ----------
    timers : dict or list
        GCHP timing information from one or more log files in
        plain text format.

    Returns
    -------
    result : dict
        Sum of timing information.  A dict input is returned
        unchanged (the same object); for a list, every key found in
        any of the dicts maps to the float sum of its values.

    Raises
    ------
    ValueError
        If ``timers`` is neither a dict nor a list.
    """

    # If timers is of type dict, no summing is needed.
    if isinstance(timers, dict):
        return timers

    # If timers is a list of dicts, sum the times
    # in seconds into a new dict, and then return.
    if isinstance(timers, list):
        result = {}
        for timer in timers:
            for key, val in timer.items():
                # A key seen for the first time starts from 0.0
                result[key] = result.get(key, 0.0) + float(val)
        return result

    raise ValueError("Argument 'timers' must be of type dict or list!")




[docs]
def print_timer(key, ref, dev, ofile):
    """
    Prints timing info for a single timer to a log file.

    The line holds the timer label, the Ref and Dev values, and the
    percent difference of Dev relative to Ref.  Lines whose percent
    difference is at least 10% in magnitude are flagged with " *".

    Parameters
    ----------
    key : str
        Dictionary key to print (dot-separated timer name).
    ref : dict
        Timing information from the "Ref" model.
    dev : dict
        Timing information from the "Dev" model.
    ofile : file
        File object where info will be written.

    Notes
    -----
    A timer that is absent from ``ref`` or ``dev`` is printed as
    "nan" instead of raising ``KeyError``, so that one timer existing
    in only one of the two models does not abort the whole table.
    """
    # Denote the level of the dictionary key by counting "." chars
    depth = key.count(".")

    # Prefix "--" once per level to the last component of the key,
    # to replicate the label style at the end of the GCHP log file
    label = "--" * depth + key.split(".")[-1]

    # Missing timers are shown as NaN
    ref_val = ref.get(key, np.nan)
    dev_val = dev.get(key, np.nan)

    # Percent difference is only defined for a nonzero Ref value
    # (the comparison is False for NaN, which leaves pctdiff as NaN)
    pctdiff = np.nan
    if np.abs(ref_val) > 0.0:
        pctdiff = ((dev_val - ref_val) / ref_val) * 100.0

    # Line to print
    line = (
        f"{label:<22}  {ref_val:>18.3f}  {dev_val:>18.3f}"
        f"   {pctdiff:>12.3f}"
    )
    if np.abs(pctdiff) >= 10.0:  # Flag diffs of +/- 10% or more
        line += " *"
    print(line, file=ofile)




[docs]
def display_timers(ref, ref_label, dev, dev_label, table_file):
    """
    Writes the GCHP timer comparison table to a text file.

    The file starts with a banner naming the Ref and Dev versions,
    followed by two sections: timers whose key starts with
    "GCHPchem" and timers whose key starts with "All".  Rows are
    taken from the keys of ``dev`` in their iteration order, and each
    row is written by ``print_timer``.

    Parameters
    ----------
    ref : dict
        Timing information from the "Ref" model.
    ref_label : str
        Version string for the "Ref" model.
    dev : dict
        Timing information from the "Dev" model.
    dev_label : str
        Version string for the "Dev" model.
    table_file : str
        File name for the timing table output (opened for writing).
    """
    banner = "%" * 79
    rule = "-" * 79

    # (column heading, key prefix) for each section of the table
    sections = (
        ("GCHPchem Timer", "GCHPchem"),
        ("Summary", "All"),
    )

    with open(table_file, "w", encoding=ENCODING) as ofile:

        # Banner with the two version labels
        for text in (
                banner,
                "%%% GCHP Benchmark Timing Information",
                "%%%",
                f"%%% Ref = {ref_label}",
                f"%%% Dev = {dev_label}",
                banner,
        ):
            print(text, file=ofile)

        for heading, prefix in sections:

            # Blank lines, column headings and a rule ...
            print("\n", file=ofile)
            print(
                f"{heading:<22}  {'Ref [s]':>18}  {'Dev [s]':>18}"
                f"   {'% Diff':>12}",
                file=ofile,
            )
            print(rule, file=ofile)

            # ... then one row per matching timer
            for key in dev:
                if key.startswith(prefix):
                    print_timer(key, ref, dev, ofile)




[docs]
def make_benchmark_gchp_timing_table(
        ref_files,
        ref_label,
        dev_files,
        dev_label,
        dst="./benchmark",
        overwrite=False,
):
    """
    Builds the Ref-vs-Dev timing table for GCHP benchmark simulations.

    The timers are read from the given text file(s), summed per
    model, and written to
    "Benchmark_Timers_<ref_label>_vs_<dev_label>.txt" in ``dst``.

    Parameters
    ----------
    ref_files : str or list
        File(s) with timing info from the "Ref" model.
    ref_label : str
        Version string for the "Ref" model.
    dev_files : str or list
        File(s) with timing info from the "Dev" model.
    dev_label : str
        Version string for the "Dev" model.
    dst : str, optional
        Directory where output will be written.
    overwrite : bool, optional
        Forwarded to ``make_directory`` together with ``dst``.
        Default: False.
    """
    # Validate the argument types up front
    for value, expected in (
            (ref_files, (str, list)),
            (ref_label, str),
            (dev_files, (str, list)),
            (dev_label, str),
            (dst, str),
    ):
        verify_variable_type(value, expected)

    # Create the destination folder
    make_directory(dst, overwrite)

    # Labels end up in the file name, so remove their whitespace
    ref_label = replace_whitespace(ref_label)
    dev_label = replace_whitespace(dev_label)

    # Read the timing info from the text file(s) and sum it per model
    ref_timers = sum_timers(read_timing_data(ref_files))
    dev_timers = sum_timers(read_timing_data(dev_files))

    # Filename for output
    # NOTE(review): replace_whitespace is applied to the whole path,
    # "dst" included -- presumably "dst" has no whitespace; confirm.
    table_name = f"Benchmark_Timers_{ref_label}_vs_{dev_label}.txt"
    timing_table = replace_whitespace(os.path.join(dst, table_name))

    # Write timing info to a table
    display_timers(
        ref_timers,
        replace_whitespace(ref_label),
        dev_timers,
        replace_whitespace(dev_label),
        timing_table,
    )