from __future__ import annotations

import datetime
import inspect
import os
import time
import uuid
from decimal import Decimal
from typing import Any
from warnings import warn


# boto3 is an optional dependency. If it's not installed,
# we'll just not emit the metrics.
# Keeping this logic here so that callers don't have to
# worry about it.
EMIT_METRICS = False
try:
    import boto3  # type: ignore[import]

    EMIT_METRICS = True
except ImportError as e:
    print(f"Unable to import boto3. Will not be emitting metrics.... Reason: {e}")


# Sometimes our runner machines are located in one AWS account while the metrics table may be in
# another, so we need to specify the table's ARN explicitly.
TORCHCI_METRICS_TABLE_ARN = (
    "arn:aws:dynamodb:us-east-1:308535385114:table/torchci-metrics"
)


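# Illustrative note (not part of this module's contract): EMIT_METRICS is a plain
# module-level bool, so callers that compute expensive values can consult it before
# doing any work. A minimal sketch, assuming this module is importable as
# tools.stats.upload_metrics and that compute_expensive_stat() is a hypothetical helper:
#
#     from tools.stats.upload_metrics import EMIT_METRICS, emit_metric
#
#     if EMIT_METRICS:
#         emit_metric("expensive_stat", {"value": compute_expensive_stat()})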


class EnvVarMetric:
    name: str
    env_var: str
    required: bool = True
    # Used to cast the value of the env_var to the correct type (defaults to str)
    type_conversion_fn: Any = None

    def __init__(
        self,
        name: str,
        env_var: str,
        required: bool = True,
        type_conversion_fn: Any = None,
    ) -> None:
        self.name = name
        self.env_var = env_var
        self.required = required
        self.type_conversion_fn = type_conversion_fn

    def value(self) -> Any:
        value = os.environ.get(self.env_var)

        # Github CI will set some env vars to an empty string
        DEFAULT_ENVVAR_VALUES = [None, ""]
        if value in DEFAULT_ENVVAR_VALUES:
            if not self.required:
                return None

            raise ValueError(
                f"Missing {self.name}. Please set the {self.env_var} "
                "environment variable to pass in this value."
            )

        if self.type_conversion_fn:
            return self.type_conversion_fn(value)
        return value


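# A minimal usage sketch (illustrative only): reading a CI-provided variable and
# casting it to int. The environment assignment below exists just for the example.
#
#     os.environ["GITHUB_RUN_ID"] = "12345"
#     run_id = EnvVarMetric("run_id", "GITHUB_RUN_ID", type_conversion_fn=int).value()
#     assert run_id == 12345
#
# Optional metrics (required=False) return None instead of raising when the
# variable is unset or empty.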


global_metrics: dict[str, Any] = {}


def add_global_metric(metric_name: str, metric_value: Any) -> None:
    """
    Adds stats that should be emitted with every metric by the current process.
    If emit_metric specifies a metric with the same name, it will overwrite this
    value.
    """
    global_metrics[metric_name] = metric_value


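# Usage sketch (illustrative): tag every metric emitted by this process with extra
# context. "SHARD_NUMBER" here is only an example variable, not one this module reads.
#
#     add_global_metric("shard", os.environ.get("SHARD_NUMBER"))
#     # Every later emit_metric(...) call from this process now includes "shard",
#     # unless the caller passes its own "shard" key, which takes precedence.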


def emit_metric(
    metric_name: str,
    metrics: dict[str, Any],
) -> None:
    """
    Upload a metric to DynamoDB (and from there, Rockset).

    Even if EMIT_METRICS is set to False, this function will still run the code to
    validate and shape the metrics, skipping just the upload.

    Parameters:
        metric_name:
            Name of the metric. Every unique metric should have a different name
            and be emitted just once per run attempt.
            Metrics are namespaced by their module and the function that emitted them.
        metrics: The actual data to record.

    Some default values are populated from environment variables, which must be set
    for metrics to be emitted. If they're not set, this function becomes a no-op.
    """

    if metrics is None:
        raise ValueError("You didn't ask to upload any metrics!")

    # Merge the given metrics with the global metrics, overwriting any duplicates
    # with the given metrics.
    metrics = {**global_metrics, **metrics}

    # We use these env vars to determine basic info about the workflow run.
    # By using env vars, we don't have to pass this info around to every function.
    # It also helps ensure that we only emit metrics during CI.
    env_var_metrics = [
        EnvVarMetric("repo", "GITHUB_REPOSITORY"),
        EnvVarMetric("workflow", "GITHUB_WORKFLOW"),
        EnvVarMetric("build_environment", "BUILD_ENVIRONMENT", required=False),
        EnvVarMetric("job", "GITHUB_JOB"),
        EnvVarMetric("test_config", "TEST_CONFIG", required=False),
        EnvVarMetric("pr_number", "PR_NUMBER", required=False, type_conversion_fn=int),
        EnvVarMetric("run_id", "GITHUB_RUN_ID", type_conversion_fn=int),
        EnvVarMetric("run_number", "GITHUB_RUN_NUMBER", type_conversion_fn=int),
        EnvVarMetric("run_attempt", "GITHUB_RUN_ATTEMPT", type_conversion_fn=int),
        EnvVarMetric("job_id", "JOB_ID", type_conversion_fn=int),
        EnvVarMetric("job_name", "JOB_NAME"),
    ]

    # Use info about the function that invoked this one as a namespace and a way to filter metrics.
    calling_frame = inspect.currentframe().f_back  # type: ignore[union-attr]
    calling_frame_info = inspect.getframeinfo(calling_frame)  # type: ignore[arg-type]
    calling_file = os.path.basename(calling_frame_info.filename)
    calling_module = inspect.getmodule(calling_frame).__name__  # type: ignore[union-attr]
    calling_function = calling_frame_info.function

    try:
        reserved_metrics = {
            "metric_name": metric_name,
            "calling_file": calling_file,
            "calling_module": calling_module,
            "calling_function": calling_function,
            "timestamp": datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S.%f"),
            **{m.name: m.value() for m in env_var_metrics if m.value()},
        }
    except ValueError as e:
        warn(f"Not emitting metrics for {metric_name}. {e}")
        return

    # Prefix key with metric name and timestamp to derisk chance of a uuid1 name collision
    reserved_metrics[
        "dynamo_key"
    ] = f"{metric_name}_{int(time.time())}_{uuid.uuid1().hex}"

    # Ensure the metrics dict doesn't contain any reserved keys
    for key in reserved_metrics.keys():
        used_reserved_keys = [k for k in metrics.keys() if k == key]
        if used_reserved_keys:
            raise ValueError(
                f"Metrics dict contains reserved keys: [{', '.join(used_reserved_keys)}]"
            )

    # boto3 doesn't support uploading float values to DynamoDB, so convert them all to decimals.
    metrics = _convert_float_values_to_decimals(metrics)

    if EMIT_METRICS:
        try:
            session = boto3.Session(region_name="us-east-1")
            session.resource("dynamodb").Table(TORCHCI_METRICS_TABLE_ARN).put_item(
                Item={
                    **reserved_metrics,
                    **metrics,
                }
            )
        except Exception as e:
            # We don't want to fail the job if we can't upload the metric.
            # We still raise the ValueErrors outside this try block since those
            # indicate improperly configured metrics.
            warn(f"Error uploading metric {metric_name} to DynamoDB: {e}")
            return
    else:
        print(f"Not emitting metrics for {metric_name}. Boto wasn't imported.")


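# End-to-end usage sketch (illustrative; assumes the CI environment variables listed
# in env_var_metrics are set, otherwise this becomes a warning and a no-op):
#
#     emit_metric(
#         "td_test_failure_stats",  # hypothetical metric name
#         {"num_failures": 3, "duration_s": 12.5},  # floats are converted to Decimal
#     )
#
# The metric is namespaced by the calling file/module/function, merged with
# global_metrics, stamped with a unique "dynamo_key", and uploaded to the
# torchci-metrics DynamoDB table when boto3 is available.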


def _convert_float_values_to_decimals(data: dict[str, Any]) -> dict[str, Any]:
    # Attempt to recurse
    def _helper(o: Any) -> Any:
        if isinstance(o, float):
            return Decimal(str(o))
        if isinstance(o, list):
            return [_helper(v) for v in o]
        if isinstance(o, dict):
            return {_helper(k): _helper(v) for k, v in o.items()}
        if isinstance(o, tuple):
            return tuple(_helper(v) for v in o)
        return o

    return {k: _helper(v) for k, v in data.items()}


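# Conversion sketch (illustrative): floats are rewritten as Decimal recursively,
# including inside nested lists, tuples, and dicts; other types pass through.
#
#     _convert_float_values_to_decimals({"a": 1.5, "b": {"c": [2.0, "x"]}})
#     # -> {"a": Decimal("1.5"), "b": {"c": [Decimal("2.0"), "x"]}}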