Auto-retry on transient failures + Run Now button for manual triggers

This commit is contained in:
2026-04-13 14:20:16 +00:00
parent d28143ec00
commit d29555ae53
3 changed files with 85 additions and 9 deletions
+38 -9
View File
@@ -3,9 +3,10 @@
import json
import os
import sys
import time
from datetime import datetime
from zoneinfo import ZoneInfo
from urllib import request
from urllib import request, error as urlerror
MT = ZoneInfo("America/Denver")
@@ -21,26 +22,54 @@ MONTH_NAMES = [
"July", "August", "September", "October", "November", "December",
]
# Retry config
DEFAULT_RETRIES = 3
DEFAULT_BACKOFF = 2  # seconds, doubles each retry
RETRIABLE_CODES = {408, 429, 500, 502, 503, 504}


def api_request(url, data=None, headers=None, method="GET", retries=DEFAULT_RETRIES, backoff=DEFAULT_BACKOFF):
    """HTTP helper with automatic retry on transient failures.

    Sends one request and decodes the response body as JSON. HTTP errors
    whose status is in RETRIABLE_CODES, and any OSError-derived failure
    (URLError, timeouts, connection errors), are retried with exponential
    backoff. Retries are applied regardless of ``method``, so a request
    that is not idempotent may be sent more than once.

    Args:
        url: Target URL.
        data: Optional JSON-serializable payload. When given it is encoded
            as UTF-8 JSON and sent with a JSON Content-Type header.
        headers: Optional dict of extra request headers.
        method: HTTP method name.
        retries: Number of retries after the first attempt. Values below
            zero are treated as zero, so at least one attempt is made.
        backoff: Base delay in seconds; the delay doubles on every retry.

    Returns:
        The response body parsed as JSON.

    Raises:
        urllib.error.HTTPError: Non-retriable status, or retries exhausted.
        OSError: Network-level failure (including urllib.error.URLError and
            timeouts) once retries are exhausted.
        ValueError: The response body is not valid JSON.
    """
    body = None if data is None else json.dumps(data).encode("utf-8")
    # A negative count would otherwise skip the loop entirely and end in
    # "raise None"; always make at least the initial attempt.
    retries = max(0, retries)

    for attempt in range(retries + 1):
        try:
            req = request.Request(url, data=body, headers=headers or {}, method=method)
            if body is not None:
                req.add_header("Content-Type", "application/json")
            with request.urlopen(req, timeout=30) as resp:
                return json.loads(resp.read().decode())
        except urlerror.HTTPError as e:
            # Must come before the OSError handler: HTTPError is a URLError,
            # which in turn is an OSError.
            if e.code not in RETRIABLE_CODES or attempt >= retries:
                raise
            reason = e.code
            # The error object wraps the open response; release it before
            # sleeping instead of holding the connection during the backoff.
            e.close()
        except OSError as e:
            # URLError, TimeoutError and ConnectionError are all OSError
            # subclasses, so this single clause covers every network failure.
            if attempt >= retries:
                raise
            reason = type(e).__name__

        wait = backoff * (2 ** attempt)
        print(f" Retry {attempt + 1}/{retries} after {reason} from {url} (waiting {wait}s)", file=sys.stderr)
        time.sleep(wait)
def log_run(agent_id, status, output="", err="", metadata=None, instance_id=None):
"""Log a run to the dashboard API. Uses instance_id if available (v2), falls back to agent_id."""
"""Log a run to the dashboard API."""
try:
if instance_id:
api_request(
f"{DASHBOARD_API}/api/instances/{instance_id}/runs",
data={"status": status, "output": output, "error": err, "metadata": metadata or {}},
method="POST",
retries=1, # Don't retry logging too aggressively
)
else:
print(f"Warning: no instance_id, run not logged for {agent_id}", file=sys.stderr)