Source code for gcpy.benchmark.modules.benchmark_scrape_gchp_timers

#!/usr/bin/env python3
"""
Scrapes GCHP benchmark timing information from one or
more text files.
"""
import os
import subprocess
import numpy as np
from gcpy.constants import ENCODING
from gcpy.util import make_directory, replace_whitespace, verify_variable_type


def read_timing_data(input_files):
    """
    Reads GCHP timing information from one or more log files.

    Every file is parsed with ``read_one_text_file``.

    Parameters
    ----------
    input_files : str or list
        Text file(s) to parse.

    Returns
    -------
    timing : list of dict
        One dictionary of timing information per input file, in the
        order given.  A single file name yields a one-element list.

    Raises
    ------
    ValueError
        If ``input_files`` is neither a str nor a list.
    """
    # Treat a lone file name as a one-element collection so that both
    # accepted input types share the same parsing path below.
    if isinstance(input_files, str):
        file_names = [input_files]
    elif isinstance(input_files, list):
        file_names = input_files
    else:
        raise ValueError("Argument 'input_files' is not of type str or list!")

    return [read_one_text_file(file_name) for file_name in file_names]
def count_characters(text, char_to_match="."):
    """
    Counts how many times a given character occurs in a string.

    Parameters
    ----------
    text : str
        The text to parse.
    char_to_match : str, optional
        The character to look for in ``text``.  Default: "."

    Returns
    -------
    result : int
        Number of occurrences of ``char_to_match`` in ``text``
        (0 if it does not occur at all).
    """
    # Compare character by character: a ``char_to_match`` that is not
    # exactly one character long can never equal a single character,
    # so it yields 0 rather than a substring count.
    return sum(1 for char in text if char == char_to_match)
def check_file_for_timing_info(text_file):
    """
    Checks if a given text file contains GCHP timers output.  If not,
    the file name is reset to allPEs.log (in the same path).

    This update is necessary because MAPL v2.59 has switched the GCHP
    timers printout from the GCHP log file to allPEs.log.  Checking
    the given file first keeps backwards compatibility with output
    from GCHP simulations that use older MAPL versions.

    Parameters
    ----------
    text_file : str
        Name of the text file to check.

    Returns
    -------
    text_file : str
        Name of the text file containing GCHP timers output
        (either the input file or "allPEs.log" in the same folder).
    """
    # The file is scanned as raw bytes so that no external "grep"
    # executable is needed and undecodable bytes elsewhere in the
    # log cannot abort the search.
    marker = b"Times for component <GCHPchem>"
    try:
        with open(text_file, "rb") as ifile:
            has_timers = any(marker in line for line in ifile)
    except OSError:
        # A missing or unreadable file is treated like a file without
        # timers, so the caller is pointed at allPEs.log instead.
        has_timers = False

    if has_timers:
        return text_file
    return os.path.join(os.path.dirname(text_file), "allPEs.log")
def read_one_text_file(text_file):
    """
    Parses a GCHP log file (plain text) with timing information
    and returns a dictionary with the results.

    Two parts of the log are read: the section that starts at
    "Times for component <GCHPchem>" and the section that starts at
    "Report on process:".  Within those sections, header and separator
    lines are skipped and every remaining line contributes one timer.

    Timer names in the log are nested by prefixing two "-" characters
    per level.  The nesting is returned "flattened": each key is the
    chain of parent names joined with ".", e.g. "All.Run.GCHPchem".

    Parameters
    ----------
    text_file : str
        Text file with timing information.  If it holds no GCHP timers,
        "allPEs.log" in the same folder is read instead.

    Returns
    -------
    timers : dict
        Dictionary with timing information (flattened key -> float).

    Raises
    ------
    FileNotFoundError
        If ``text_file`` does not exist.
    """
    # Make sure file exists
    if not os.path.exists(text_file):
        raise FileNotFoundError(f"Could not find {text_file}!")

    # ==================================================================
    # Parse the GCHP log file
    # ==================================================================

    # Header/separator lines inside the GCHPchem timers section
    gchpchem_headers = (
        'Min Mean',
        '============================',
        'Name %',
        '------ ---------- ----------',
    )

    # Header/separator lines inside the summary timers section
    summary_headers = (
        'Inclusive',
        '================',
        'Name',
        '-------- --------- ------ --------- ------',
    )

    # Check if the input file has GCHP timers, otherwise use "allPEs.log"
    text_file = check_file_for_timing_info(text_file)

    keep_line = False   # True while inside a section we want to keep
    inclusive = 0       # Index of the column holding the time to read
    raw_timers = []     # (name with "-" prefix, time) in file order

    with open(text_file, encoding=ENCODING) as ifile:
        for line in ifile:

            # Strip out prefixes that are only present in "allPEs.log"
            line = line.replace("0000: GCHPctmEnv: INFO:", "")
            line = line.replace("0000: MAPL.profiler: INFO:", "")

            # Strip newlines and skip empty lines
            line = line.strip()
            if not line:
                continue

            # GCHPchem timers section (also skip header lines)
            if 'Times for component <GCHPchem>' in line:
                keep_line = True
                inclusive = 3
                continue
            if keep_line:
                if any(header in line for header in gchpchem_headers):
                    continue
                # The long dashed rule closes the GCHPchem section
                if '---------------------------------' in line:
                    keep_line = False
                    continue

            # Skip everything that follows GCHPchem until the
            # summary timers section
            if "Times for component <DYNAMICS>" in line:
                keep_line = False

            # Summary section (also skip header lines)
            if 'Report on process:' in line:
                keep_line = True
                inclusive = 2
                continue

            if not keep_line:
                continue
            if any(header in line for header in summary_headers):
                continue

            # This line appears in GCHP log files using MAPL
            # versions prior to 2.59.  It indicates the end of the
            # GCHP timers section.  Exit when we encounter this.
            if "++" in line:
                break

            # Keep the timer name and the time from the chosen column
            columns = line.split()
            raw_timers.append((columns[0], float(columns[inclusive])))

    # ==================================================================
    # Save timing results into a "flattened" dictionary
    # ==================================================================
    max_depth = 4
    parents = [""] * (max_depth + 1)   # Most recent name seen per level
    timers = {}
    for (key, val) in raw_timers:

        # Nesting level = number of *leading* "-" characters / 2.
        # Hyphens inside the name itself must not add to the depth.
        n_dashes = len(key) - len(key.lstrip("-"))
        depth, remainder = divmod(n_dashes, 2)

        # Remove any prefixed "-" characters
        name = key.strip("-")

        if remainder == 0 and depth <= max_depth:
            # Remember this name as the parent for deeper levels
            parents[depth] = name
            prefix = parents[:depth]
        else:
            # Deeper than the tracked levels (or an odd number of
            # dashes): file the timer under all tracked parents
            prefix = parents

        timers[".".join(prefix + [name])] = val

    return timers
def sum_timers(timers):
    """
    Sums the time in seconds for each GCHP timer.  Input may be
    a single dict with timing information or a list of dicts.

    Parameters
    ----------
    timers : dict or list
        GCHP timing information from one or more log files
        in plain text format.

    Returns
    -------
    result : dict
        Sum of timing information.  A dict input is returned as-is;
        for a list, each key maps to the total over all dicts that
        contain it, as a float.

    Raises
    ------
    ValueError
        If ``timers`` is neither a dict nor a list.
    """
    # If timers is of type dict, no summing is needed.
    if isinstance(timers, dict):
        return timers

    # If timers is a list of dicts, accumulate the times in seconds
    # into a new dict.  Keys appear in order of first occurrence.
    if isinstance(timers, list):
        result = {}
        for timer in timers:
            for (key, val) in timer.items():
                result[key] = result.get(key, 0.0) + float(val)
        return result

    raise ValueError("Argument 'timers' must be of type dict or list!")
def display_timers(ref, ref_label, dev, dev_label, table_file):
    """
    Writes the GCHP timer information to a table in a text file.

    The table has a banner naming Ref and Dev, followed by two
    sections: timers whose key starts with "GCHPchem" and summary
    timers whose key starts with "All".  Rows are generated from the
    keys of ``dev``, so a timer present only in ``ref`` is not listed.

    Parameters
    ----------
    ref : dict
        Timing information from the "Ref" model.
    ref_label : str
        Version string for the "Ref" model.
    dev : dict
        Timing information from the "Dev" model.
    dev_label : str
        Version string for the "Dev" model.
    table_file : str
        File name for the timing table output.
    """
    banner = "%"*79
    rule = "-"*79

    # (column title, key prefix selecting the timers of that section)
    sections = (
        ("GCHPchem Timer", "GCHPchem"),
        ("Summary", "All"),
    )

    with open(table_file, "w", encoding=ENCODING) as ofile:

        # Print header
        print(banner, file=ofile)
        print("%%% GCHP Benchmark Timing Information", file=ofile)
        print("%%%", file=ofile)
        print(f"%%% Ref = {ref_label}", file=ofile)
        print(f"%%% Dev = {dev_label}", file=ofile)
        print(banner, file=ofile)

        # One block per section: spacer, column titles, rule, rows
        for (title, prefix) in sections:
            print("\n", file=ofile)
            print(f"{title:<22} {'Ref [s]':>18} {'Dev [s]':>18} {'% Diff':>12}", file=ofile)
            print(rule, file=ofile)
            for key in dev:
                if key.startswith(prefix):
                    # print_timer is defined outside this block; it is
                    # handed the key, both dicts and the open file.
                    print_timer(key, ref, dev, ofile)
def make_benchmark_gchp_timing_table(
        ref_files,
        ref_label,
        dev_files,
        dev_label,
        dst="./benchmark",
        overwrite=False,
):
    """
    Creates a table of timing information for GCHP benchmark
    simulations given one or more text files as input.

    The table is written to
    ``<dst>/Benchmark_Timers_<ref_label>_vs_<dev_label>.txt``, with
    the labels and the full path passed through ``replace_whitespace``.

    Parameters
    ----------
    ref_files : str or list
        File(s) with timing info from the "Ref" model.
    ref_label : str
        Version string for the "Ref" model.
    dev_files : str or list
        File(s) with timing info from the "Dev" model.
    dev_label : str
        Version string for the "Dev" model.
    dst : str, optional
        Directory where output will be written.
        Default: "./benchmark"
    overwrite : bool, optional
        Overwrite existing files?  Passed on to ``make_directory``.
        Default: False.
    """
    # Check the argument types before touching the file system
    expected_types = (
        (ref_files, (str, list)),
        (ref_label, str),
        (dev_files, (str, list)),
        (dev_label, str),
        (dst, str),
    )
    for (value, types) in expected_types:
        verify_variable_type(value, types)

    # Create the destination folder
    make_directory(dst, overwrite)

    # Replace whitespace in the ref and dev labels
    ref_label = replace_whitespace(ref_label)
    dev_label = replace_whitespace(dev_label)

    # Read the timing info from the text file(s) and sum it per timer
    ref_timers = sum_timers(read_timing_data(ref_files))
    dev_timers = sum_timers(read_timing_data(dev_files))

    # Filename for output
    table_name = f"Benchmark_Timers_{ref_label}_vs_{dev_label}.txt"
    timing_table = replace_whitespace(os.path.join(dst, table_name))

    # Write timing info to a table
    display_timers(
        ref_timers,
        replace_whitespace(ref_label),
        dev_timers,
        replace_whitespace(dev_label),
        timing_table,
    )