Auto-retry on transient failures + Run Now button for manual triggers
This commit is contained in:
+38
-9
@@ -3,9 +3,10 @@
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime
|
||||
from zoneinfo import ZoneInfo
|
||||
from urllib import request
|
||||
from urllib import request, error as urlerror
|
||||
|
||||
MT = ZoneInfo("America/Denver")
|
||||
|
||||
@@ -21,26 +22,54 @@ MONTH_NAMES = [
|
||||
"July", "August", "September", "October", "November", "December",
|
||||
]
|
||||
|
||||
# Retry config
DEFAULT_RETRIES = 3  # extra attempts made after the first failed one
DEFAULT_BACKOFF = 2  # seconds, doubles each retry
RETRIABLE_CODES = {408, 429, 500, 502, 503, 504}  # HTTP statuses that trigger a retry


def _wait_before_retry(attempt, retries, backoff, reason, url):
    """Report retry number ``attempt + 1`` on stderr, then sleep ``backoff * 2**attempt`` seconds."""
    wait = backoff * (2 ** attempt)
    print(f" Retry {attempt + 1}/{retries} after {reason} from {url} (waiting {wait}s)", file=sys.stderr)
    time.sleep(wait)


def api_request(url, data=None, headers=None, method="GET", retries=DEFAULT_RETRIES, backoff=DEFAULT_BACKOFF):
    """HTTP helper with automatic retry on transient failures.

    Sends one request with urllib and returns the JSON-decoded response body.

    Args:
        url: Address to request.
        data: Optional payload; when not None it is JSON-encoded, sent as the
            request body, and a ``Content-Type: application/json`` header is added.
        headers: Optional mapping of extra request headers.
        method: HTTP method name.
        retries: Number of additional attempts after the first one. Negative
            values are treated as 0, so exactly one attempt is still made.
        backoff: Delay in seconds before the first retry; doubles on every
            further retry.

    Returns:
        The parsed JSON document from the response body.

    Raises:
        urllib.error.HTTPError: For a status outside ``RETRIABLE_CODES``, or
            for a retriable status once the retry budget is used up.
        OSError: For connection-level failures (``URLError``, timeouts, ...)
            once the retry budget is used up.
        ValueError: If the response body is not valid JSON (never retried).

    NOTE(review): every method is retried, including POST; confirm that the
    endpoints called through this helper tolerate a duplicated request.
    """
    if data is not None:
        data = json.dumps(data).encode("utf-8")

    # A negative budget would make the loop below run zero times and send
    # nothing at all; always perform at least the initial attempt.
    retries = max(0, retries)

    for attempt in range(retries + 1):
        can_retry = attempt < retries
        try:
            req = request.Request(url, data=data, headers=headers or {}, method=method)
            if data is not None:
                req.add_header("Content-Type", "application/json")
            with request.urlopen(req, timeout=30) as resp:
                return json.loads(resp.read().decode())
        except urlerror.HTTPError as e:
            # HTTPError must be handled before OSError: it is a subclass, and
            # only the status codes listed in RETRIABLE_CODES are retried.
            if not (can_retry and e.code in RETRIABLE_CODES):
                raise
            e.close()  # the error response is discarded, so release it before retrying
            _wait_before_retry(attempt, retries, backoff, e.code, url)
        except OSError as e:
            # URLError, TimeoutError and ConnectionError are all OSError subclasses.
            if not can_retry:
                raise
            _wait_before_retry(attempt, retries, backoff, type(e).__name__, url)
    # Not reachable: the final iteration (attempt == retries) either returns or raises.
|
||||
|
||||
|
||||
def log_run(agent_id, status, output="", err="", metadata=None, instance_id=None):
|
||||
"""Log a run to the dashboard API. Uses instance_id if available (v2), falls back to agent_id."""
|
||||
"""Log a run to the dashboard API."""
|
||||
try:
|
||||
if instance_id:
|
||||
api_request(
|
||||
f"{DASHBOARD_API}/api/instances/{instance_id}/runs",
|
||||
data={"status": status, "output": output, "error": err, "metadata": metadata or {}},
|
||||
method="POST",
|
||||
retries=1, # Don't retry logging too aggressively
|
||||
)
|
||||
else:
|
||||
print(f"Warning: no instance_id, run not logged for {agent_id}", file=sys.stderr)
|
||||
|
||||
Reference in New Issue
Block a user