#!/usr/bin/env python3
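r"""
Scrapes timing, memory, and OH-metric statistics from a 1-month
GEOS-Chem Classic benchmark run and prints them as a single
comma-separated line, which can then be pasted into the
"GEOS-Chem 1-month Benchmark Stats" Google spreadsheet.

The first command-line argument is the label of the Ref version and
the second is the label of the Dev version.

Examples
--------
.. code-block:: console

   $ conda activate gcpy_env
   $ python -m gcpy.benchmark.modules.benchmark_scrape_gcclassic_stats \
       14.5.0-alpha.5 \
       14.5.0-alpha.6
"""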
import sys
import requests
from gcpy.util import replace_whitespace, verify_variable_type
# ----------------------------------------------------------------------
# Global variables
# ----------------------------------------------------------------------
# Base URL under which the benchmark output files are looked up.
ROOT = "https://s3.amazonaws.com/benchmarks-cloud"

# URL templates for the run log and the OH metrics table.  "ID" is a
# placeholder that main() replaces with the benchmark identifier.
LOG_TEMPLATE = ROOT + "/benchmarks/1Mon/gcc/ID/RunGCC.txt"
METRICS_TEMPLATE = (
    ROOT + "/diff-plots/1Mon/ID/BenchmarkResults/Tables/OH_metrics.txt"
)

# Timer labels, in the order in which print_stats() writes them out.
# scrape_stats() tests each label as a substring of every log line,
# and print_stats() uses the text before the colon as the lookup key.
# NOTE(review): because the match is a plain substring test, the
# spacing in front of each colon must agree with the log file's own
# layout -- confirm against a real RunGCC.txt.
TIMERS = [
    "GEOS-Chem :",
    "HEMCO :",
    "=> Gas-phase chem :",
    "=> Photolysis :",
    "=> Aerosol chem :",
    "=> Linearized chem :",
    "Transport :",
    "Convection :",
    "Boundary layer mixing :",
    "Dry deposition :",
    "Wet deposition :",
    "Diagnostics :",
    "Unit conversions :",
]
# ----------------------------------------------------------------------
# Functions
# ----------------------------------------------------------------------
[docs]
def print_stats(stats):
    """
    Prints OH metrics and timing statistics as one comma-separated line.

    Parameters
    ----------
    stats : dict
        Dictionary with statistics to print.  It is indexed with the
        keys "Wall Time", "Memory", "Mean OH", "CH3CCl3" and "CH4",
        plus one key per entry of TIMERS (the text before the colon,
        passed through format_timer).

    Raises
    ------
    KeyError
        If any of the keys listed above is missing from stats.
    """
    # Every field carries its own trailing comma(s).  Consecutive
    # commas produce empty fields in the printed line.
    fields = [
        # Time and memory
        f"{stats['Wall Time']},,,",
        f"{stats['Memory']},",
        # OH metrics
        f"{stats['Mean OH']},,",
        f"{stats['CH3CCl3']},",
        f"{stats['CH4']},,",
    ]

    # Timers, in the order in which they are listed in TIMERS.
    # NOTE(review): format_timer is not defined in the visible part of
    # this file; parse_timer() applies the same call when it builds
    # the keys, so both sides presumably agree -- confirm.
    for label in TIMERS:
        key = format_timer(label.split(":", maxsplit=1)[0])
        fields.append(f"{stats[key]},")

    print("".join(fields))
[docs]
def parse_timer(timer):
    """
    Extracts the timer name and time in seconds from the given text.

    Parameters
    ----------
    timer : str
        Line of text with GEOS-Chem Classic timing output.

    Returns
    -------
    name : str
        Text in front of the first colon, passed through format_timer.
    seconds : str
        Second whitespace-separated token that follows the third colon.

    Raises
    ------
    IndexError
        If the line has fewer than three colons, or fewer than two
        tokens after the third colon.
    """
    # NOTE(review): the indexing below assumes a line laid out as
    # "<name> : DD-hh:mm:ss.SSS <total seconds>" -- confirm against
    # the log file.
    fields = timer.split(":")
    name = format_timer(fields[0])

    # str.split() without arguments already discards all surrounding
    # whitespace, so the token needs no further stripping.
    tokens = fields[3].split()
    return name, tokens[1]
[docs]
def scrape_stats(text):
    """
    Extracts timing statistics and OH metrics from the given text.

    Parameters
    ----------
    text : str
        Text scraped from the log file and metrics file.

    Returns
    -------
    stats : dict
        Maps "CH4", "CH3CCl3", "Mean OH", "Wall Time", "Memory" and
        the timer names returned by parse_timer to strings.  A key is
        present only if its line was found.
    """
    # Position of each OH metric, counted from the last line of the
    # text (the last line is position 0).  A metric is only taken
    # from its position if that line also contains "Dev".
    metric_at = {2: "CH4", 10: "CH3CCl3", 18: "Mean OH"}

    stats = {}

    # Walk the text from the bottom up, since the timers and the OH
    # metrics are at the end
    for position, line in enumerate(reversed(text.splitlines())):

        # Nothing above the start of the timers section is needed
        if "G E O S - C H E M T I M E R S" in line:
            break

        # OH metrics (value is whatever follows the first colon)
        if position in metric_at and "Dev" in line:
            stats[metric_at[position]] = line.split(":")[1].strip()

        # Skip commands; they still occupy a position in the count
        if "++ sed" in line:
            continue

        # Wall time
        if "wall clock" in line:
            stats["Wall Time"] = line.split("m:ss):")[1].strip()

        # Memory, scaled by 1e-6 (kB -> GB, assuming the log reports
        # the value in kbytes -- TODO confirm)
        if "Maximum resident set size" in line:
            stats["Memory"] = str(float(line.split(":")[1]) / 1.0e6)

        # GEOS-Chem Classic timers, rounded to whole seconds
        for label in TIMERS:
            if label in line:
                name, seconds = parse_timer(line)
                stats[name] = str(round(float(seconds)))

    return stats
[docs]
def get_text_from_web(url):
    """
    Returns the text from a file located on the web.

    Parameters
    ----------
    url : str
        URL of the file to be parsed.

    Returns
    -------
    text : str
        Body of the HTTP response, decoded to text.

    Raises
    ------
    FileNotFoundError
        If the server answers with an HTTP error status (4xx or 5xx).
    requests.exceptions.RequestException
        Connection failures and timeouts are not intercepted and
        propagate to the caller unchanged.
    """
    try:
        response = requests.get(url, timeout=10)

        # requests.get() returns normally for 4xx/5xx replies, so a
        # missing file would otherwise hand the server's error page
        # back to the caller as if it were the file's contents.
        response.raise_for_status()
    except requests.exceptions.HTTPError as exc:
        err_msg = f"Could not download {url} from AWS!"
        raise FileNotFoundError(err_msg) from exc

    return response.text
[docs]
def main(ref_label, dev_label):
    """
    Main program.  Given the labels from two benchmark simulations
    (ref and dev), downloads the relevant files from AWS and passes
    the text to function "scrape_stats" where it will be analyzed.
    The resulting statistics are printed to stdout by "print_stats".

    Parameters
    ----------
    ref_label : str
        Label for the Ref version.
    dev_label : str
        Label for the Dev version.
    """
    # Both labels must be strings
    for label in (ref_label, dev_label):
        verify_variable_type(label, str)

    # Replace whitespace in the ref and dev labels
    ref = replace_whitespace(ref_label)
    dev = replace_whitespace(dev_label)

    # The log file is stored under the Dev benchmark's own identifier
    log_id = f"gcc-4x5-1Mon-{dev}"
    log_url = LOG_TEMPLATE.replace("ID", log_id)

    # The metrics file is stored under the Ref-vs-Dev identifier
    metrics_id = f"diff-gcc-4x5-1Mon-{ref}-gcc-4x5-1Mon-{dev}"
    metrics_url = METRICS_TEMPLATE.replace("ID", metrics_id)

    # Download the log first, then the metrics.  scrape_stats() reads
    # the combined text from the end, so the metrics must come last.
    log_text = get_text_from_web(log_url)
    metrics_text = get_text_from_web(metrics_url)

    # Scrape the relevant statistics from the text and print to stdout
    print_stats(scrape_stats(log_text + metrics_text))
# ----------------------------------------------------------------------
# For use from the command line
# ----------------------------------------------------------------------
if __name__ == '__main__':
    # sys.argv[0] is the script name, so exactly two further
    # arguments (the Ref label, then the Dev label) are required.
    ARGS = sys.argv[1:]
    if len(ARGS) != 2:
        raise ValueError("Usage: stats.py REF-LABEL DEV-LABEL")
    main(*ARGS)